commit 1e16d84d60f92233d14a3565147618943e6f7f6d7881e7325a2327fc277cc583 Author: Adrian Schröter Date: Fri May 3 12:17:51 2024 +0200 Sync from SUSE:SLFO:Main eigen3 revision 967228583ad2815e736b681bec3007c3 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9b03811 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,23 @@ +## Default LFS +*.7z filter=lfs diff=lfs merge=lfs -text +*.bsp filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.gem filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.jar filter=lfs diff=lfs merge=lfs -text +*.lz filter=lfs diff=lfs merge=lfs -text +*.lzma filter=lfs diff=lfs merge=lfs -text +*.obscpio filter=lfs diff=lfs merge=lfs -text +*.oxt filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.rpm filter=lfs diff=lfs merge=lfs -text +*.tbz filter=lfs diff=lfs merge=lfs -text +*.tbz2 filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.ttf filter=lfs diff=lfs merge=lfs -text +*.txz filter=lfs diff=lfs merge=lfs -text +*.whl filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text diff --git a/0001-Disable-Altivec-for-ppc64le.patch b/0001-Disable-Altivec-for-ppc64le.patch new file mode 100644 index 0000000..9b4bad0 --- /dev/null +++ b/0001-Disable-Altivec-for-ppc64le.patch @@ -0,0 +1,22 @@ +From e4b0115a362a35a3ac6eacca9fbd7f57e7c02fb4 Mon Sep 17 00:00:00 2001 +From: Dinar Valeev +Date: Wed, 30 Jul 2014 17:55:03 +0200 +Subject: [PATCH] Disable Altivec for ppc64le + +--- + Eigen/Core | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +Index: eigen-3.4.0/Eigen/src/Core/util/ConfigureVectorization.h +=================================================================== +--- eigen-3.4.0.orig/Eigen/src/Core/util/ConfigureVectorization.h ++++ eigen-3.4.0/Eigen/src/Core/util/ConfigureVectorization.h +@@ -374,7 +374,7 @@ + #undef vector + #undef pixel + +- #elif defined __ALTIVEC__ ++ #elif defined __ALTIVEC__ && _CALL_ELF != 2 + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC diff --git a/0001-Do-stack-allignment-on-ppc.patch b/0001-Do-stack-allignment-on-ppc.patch new file mode 100644 index 0000000..c468d71 --- /dev/null +++ b/0001-Do-stack-allignment-on-ppc.patch @@ -0,0 +1,22 @@ +From 91025c823045259bad2297850625a08fbf986043 Mon Sep 17 00:00:00 2001 +From: Dinar Valeev +Date: Thu, 31 Jul 2014 12:51:13 +0000 +Subject: [PATCH] Do stack allignment on ppc + +--- + Eigen/src/Core/util/Macros.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +Index: eigen-3.4.0/Eigen/src/Core/util/ConfigureVectorization.h +=================================================================== +--- eigen-3.4.0.orig/Eigen/src/Core/util/ConfigureVectorization.h ++++ eigen-3.4.0/Eigen/src/Core/util/ConfigureVectorization.h +@@ -103,7 +103,7 @@ + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. 
+- #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) ++ #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. diff --git a/_constraints b/_constraints new file mode 100644 index 0000000..9b40922 --- /dev/null +++ b/_constraints @@ -0,0 +1,16 @@ + + + + + eigen3:docs + + + + 5 + + + 7 + + + + diff --git a/_multibuild b/_multibuild new file mode 100644 index 0000000..8526d3d --- /dev/null +++ b/_multibuild @@ -0,0 +1,3 @@ + + docs + diff --git a/eigen-3.4.0.tar.bz2 b/eigen-3.4.0.tar.bz2 new file mode 100644 index 0000000..a95645e --- /dev/null +++ b/eigen-3.4.0.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626 +size 2143091 diff --git a/eigen3-fix-forward_adolc-unit-test.patch b/eigen3-fix-forward_adolc-unit-test.patch new file mode 100644 index 0000000..7f494e3 --- /dev/null +++ b/eigen3-fix-forward_adolc-unit-test.patch @@ -0,0 +1,18 @@ +Index: eigen-3.3.8/unsupported/test/forward_adolc.cpp +=================================================================== +--- eigen-3.3.8.orig/unsupported/test/forward_adolc.cpp ++++ eigen-3.3.8/unsupported/test/forward_adolc.cpp +@@ -7,12 +7,12 @@ + // Public License v. 2.0. If a copy of the MPL was not distributed + // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +-#include "main.h" + #include + + #define NUMBER_DIRECTIONS 16 + #include + ++#include "main.h" + template + EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p) + { diff --git a/eigen3-googlehash-detection.patch b/eigen3-googlehash-detection.patch new file mode 100644 index 0000000..a2ea50b --- /dev/null +++ b/eigen3-googlehash-detection.patch @@ -0,0 +1,13 @@ +Index: eigen-3.3.8/cmake/FindGoogleHash.cmake +=================================================================== +--- eigen-3.3.8.orig/cmake/FindGoogleHash.cmake ++++ eigen-3.3.8/cmake/FindGoogleHash.cmake +@@ -12,6 +12,8 @@ find_path(GOOGLEHASH_INCLUDES + + if(GOOGLEHASH_INCLUDES) + # let's make sure it compiles with the current compiler ++ # C++11 is needed for googlehash ++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + file(WRITE ${CMAKE_BINARY_DIR}/googlehash_test.cpp + "#include \n#include \nint main(int argc, char** argv) { google::dense_hash_map a; google::sparse_hash_map b; return 0;}\n") + try_compile(GOOGLEHASH_COMPILE ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}/googlehash_test.cpp OUTPUT_VARIABLE GOOGLEHASH_COMPILE_RESULT) diff --git a/eigen3.changes b/eigen3.changes new file mode 100644 index 0000000..c29e8df --- /dev/null +++ b/eigen3.changes @@ -0,0 +1,825 @@ +------------------------------------------------------------------- +Tue Feb 1 23:06:35 UTC 2022 - Stefan Brüns + +- Add _constraints for docs(+test) to avoid OOM build failures +- Drop obsolete/unnecessary patches + * eigen3-3.3.1-fixcmake.patch (no longer has any effect) + * 01_install_FindEigen3.patch (CMake Config mode is preferred) +- Fix build for ppc64le (affects test and dependent packages, e.g. 
+ arpack-ng), add fix_ppc64le_always_inline_680.patch + +------------------------------------------------------------------- +Fri Aug 20 01:01:50 UTC 2021 - Atri Bhattacharya + +- Update to version 3.4.0 + * Long list of changes, see + . +- Drop patches incorporated or otherwise fixed upstream: + * eigen3-CastXML-support-for-aarch64.patch. + * eigen3-make-sparseqr-unit-test-stable.patch. + * eigen3-failtests-handling.patch. +- Rebase following patches for updated sources: + * 0001-Disable-Altivec-for-ppc64le.patch. + * 0001-Do-stack-allignment-on-ppc.patch. +- Minor line offset adjustments of other patches to apply cleanly. + +------------------------------------------------------------------- +Tue Dec 8 19:27:09 UTC 2020 - Atri Bhattacharya + +- Update to version 3.3.9: + * Introduce rendering Doxygen math formulas with MathJax and the + option EIGEN_DOC_USE_MATHJAX to control this. + * Issue 1746: Removed implementation of standard + copy-constructor and standard copy-assign-operator from + PermutationMatrix and Transpositions to allow malloc-less + std::move. + * Issue 2036: Make sure the + find_standard_math_library_test_program compiles and doesn't + optimize away functions we try to test for. + * Issue 2046: Rename test/array.cpp to test/array_cwise.cpp to + fix an issue with the C++ standard library header "array" + * Issue 2040: Fix an issue in test/ctorleak that occured when + disabling exceptions. + * Issue 2011: Remove error counting in OpenMP parallel section + in Eigen's GEMM parallelizing logic. + * Issue 2012: Define coeff-wise binary array operators for base + class to fix an issue when using Eigen with C++20 + * Fix an issue with Intel® MKL PARDISO support. +- Drop Remove-error-counting-in-OpenMP-parallelize_gemm.patch: + incorporated upstream. +- Add patches to fix building and running tests: + * eigen3-failtests-handling.patch: Simplify handling of tests + that must fail to compile; upstream commit: 72c0bbe. + * eigen3-make-sparseqr-unit-test-stable.patch: Make sparseqr + test more stable to prevent random failures; patch taken from + upstream commit + [https://gitlab.com/libeigen/eigen/-/issues/899]. + * eigen3-googlehash-detection.patch: GoogleHash needs C++11 std + to compile test code and be succesfully detected. + * eigen3-fix-forward_adolc-unit-test.patch: Prevent conflict of + std::min/max with eigen's macros by importing eigen + test-suite's main.h header only after all system headers have + been included. +- Setup but don't run tests due to random errors in test-suite + itself + * Pass EIGEN_TEST_CXX11:Bool=ON and EIGEN_TEST_OPENMP:Bool=ON to + cmake to enable building c++11 and openmp tests. + * export EIGEN_SEED=100 to build with fixed seed (see + https://gitlab.com/libeigen/eigen/-/issues/2088). + * export EIGEN_REPEAT=1 to run each test only once (default 10) + to save time and since we use a fixed seed anyway. + * Disable for non-86_64 archs as some unsupported tests fail for + the others (for i586 some supported tests too, see + https://gitlab.com/libeigen/eigen/-/issues/2092). +- Pass CMAKE_SKIP_RPATH:BOOL=OFF and + CMAKE_SKIP_INSTALL_RPATH:BOOL=ON to cmake to fix rpath handling + in Leap 15.2's cmake macros (doesn't hurt generally). +- Change CMAKE_BUILD_TYPE from Release to RelWithDebInfo (openSUSE + default). 
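[Editor's note] The include-order rule behind eigen3-fix-forward_adolc-unit-test.patch (described in the entry above) can be seen in a few standalone lines of C++. This is an illustrative sketch only: the macro name below is a placeholder, not Eigen's actual test harness, which defines similar function-like min/max macros in its main.h. The patch simply moves the #include of main.h below the system and ADOL-C headers instead of changing the test itself.

    // Illustrative sketch (assumed macro name; Eigen's real test header differs):
    // once a function-like `min` macro exists, every later unparenthesized call
    // to std::min is rewritten by the preprocessor, so any header that relies on
    // std::min/std::max must already have been included before the macro appears.
    #include <algorithm>
    #include <cstdio>

    #define min(a, b) min_macro_from_test_header(a, b)  // stands in for the test macro

    int main() {
      // Wrapping the name in parentheses suppresses function-like macro expansion;
      // a plain std::min(3, 4) here would expand to the bogus macro and not compile.
      std::printf("%d\n", (std::min)(3, 4));
      return 0;
    }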
+ +------------------------------------------------------------------- +Wed Nov 4 13:27:10 UTC 2020 - Atri Bhattacharya + +- Drop eigen_pkgconfig.patch: the includedir now needs to be + specified relative to CMAKE_INSTALL_PREFIX, so this patch causes + the includedir in the pkgconfig file to be erroneous + (boo#1178139). +- Use %autosetup to automatically run through patches. + +------------------------------------------------------------------- +Sun Oct 25 21:45:11 UTC 2020 - Stefan Brüns + +- Fix compilation error when using Eigen3 with OpenMP: + * Remove-error-counting-in-OpenMP-parallelize_gemm.patch + +------------------------------------------------------------------- +Thu Oct 15 13:23:17 UTC 2020 - Atri Bhattacharya + +- Update to version 3.3.8: + * General bug fixes: + - Issue #1995: Fix a failure in the GEBP kernel when using + small L1 cache sizes, OpenMP and FMA. + - Issue #1990: Make CMake accept installation paths relative + to CMAKE_INSTALL_PREFIX. + - Issue #1974: Fix issue when reserving an empty sparse matrix + - Issue #1823: Fix incorrect use of std::abs + - Issue #1788: Fix rule-of-three violations inside the stable + modules. This fixes deprecated-copy warnings when compiling + with GCC>=9. Also protect some additional Base-constructors + from getting called by user code code (Issue #1587) + - Issue #1796: Make matrix squareroot usable for Map and Ref + types. + - Issue #1281: Fix AutoDiffScalar's make_coherent for nested + expression of constant ADs. + - Issue #1761: Fall back is_integral to std::is_integral in + c++11 and fix internal::is_integral with + MSVC 2013 and older. + - Issue #1741: Fix self-adjoint*matrix, triangular*matrix, and + triangular^1*matrix with a destination having a non-trivial + inner-stride. + - Issue #1741: Fix SelfAdjointView::rankUpdate and product to + triangular part for destination with non-trivial inner + stride. + - Issue #1741: Fix C.noalias() = A*C; with C.innerStride()!=1. + - Issue #1695: Fix a numerical robustness issue in BDCSVD. + - Issue #1692: Enable enum as sizes of Matrix and Array. + - Issue #1689: Fix used-but-marked-unused warning. + - Issue #1679: Avoid possible division by 0 in complex-schur. + - Issue #1676: Fix C++17 template deduction in DenseBase. + - Issue #1669: Fix PartialPivLU/inverse with zero-sized + matrices. + - Issue #1557: Fix RealSchur and EigenSolver for matrices with + only zeros on the diagonal. + * Performance related fixes: + - Issue #1562: Optimize evaluation of small products of the + form s*A*B by rewriting them as: s*(A.lazyProduct(B)) to + save a costly temporary. Measured speedup from 2x to 5x... + - Fix performance issue with SimplicialLDLT for complexes + coefficients. + * Misc fixes: + - Fix QuaternionBase::cast for quaternion map and wrapper. + - Fix case issue with Lapack unit tests. + - Fix possible conflict with an externally defined "real" type + when using gcc-5. + - Provide numext::[u]int{32,64}_t. + - Initialize isometric transforms like affine transforms. + - Change typedefs from private to protected to fix MSVC + compilation. + - Fix compilation of FFTW unit test. + - Fix compilation of BLAS backend and frontend. + - Fix real/imag namespace conflict. + - Avoid throwing in destructors. + - Fix precision issue in SelfAdjointEigenSolver.h + - Make digits10() return an integer. + - Use pade for matrix exponential also for complex values. + - Cast Index to RealScalar in SVDBase to fix an issue when + RealScalar is not implicitly convertible to Index. 
+ - Provide EIGEN_HAS_C99_MATH when using MSVC. + - Various fixes in polynomial solver and its unit tests. + - nd 4415d4e2d: Extend polynomial solver unit tests to + complexes. + - Automatically switch between EigenSolver and + ComplexEigenSolver, and fix a few Real versus Scalar issues. + - Enable construction of Ref from a runtime + vector. + - Fix a problem of old gcc versions having problems with + recursive #pragma GCC diagnostic push/pop. + - Fix compilation with expression template scalar type. + - Backport AVX512 implementation to 3.3. + - Fix StlDeque compilation issue with GCC 10. + - Avoid false-positive test results in non-linear optimization + tests. + - Fix undefined behaviour caused by uncaught exceptions in OMP + section of parallel GEBP kernel. + - Fix a bug with half-precision floats on GPUs. + * Fixed warnings: + - Fix some maybe-uninitialized warnings in AmbiVector.h and + test bdcsvd. + - Silent cmake warnings in Lapack CMakeLists.txt. + - Rename variable which shadows class name in Polynomials + module. + - Workaround gcc's alloc-size-larger-than= warning in + DenseStorage.h. + - Hide some unused variable warnings in g++8.1 in Tensor + contraction mapper. + - Fix gcc 8.1 warning: "maybe use uninitialized" in std tests. + - Fix always true warning with gcc 4.7in test numext. + - Fix nonnull-compare warning in test geo_alignedbox. + - Disable ignoring attributes warning in vectorization logic + test. + - Fix a gcc7 warning about bool * bool in abs2 default + implementation. + - Fix a warning in SparseSelfAdjointView about a branch + statement always evaluation to false. +- Minor rebase of patches to fix offsets. +- Specify relative path to includedir as option to cmake as it + only accepts paths relative to CMAKE_INSTALL_PREFIX now. +- Drop Group tags. + +------------------------------------------------------------------- +Thu Sep 17 12:23:42 UTC 2020 - Atri Bhattacharya + +- Add eigen3-CastXML-support-for-aarch64.patch to support CastXML + on aarch64 [https://gitlab.com/libeigen/eigen/-/issues/1979]; + patch taken from upstream commit and rebased to apply without + fuzz. + +------------------------------------------------------------------- +Mon Aug 5 23:28:16 UTC 2019 - Stefan Brüns + +- Convert to _multibuild, to avoid doc dependencies when building + the main package. + +------------------------------------------------------------------- +Wed Jul 24 15:48:27 UTC 2019 - Stefan Brüns + +- Clean up spec file: + * Correct License tag, some included algorithms are LGPL-2.1-only + * Remove conditionals for obsolete distributions (which were failing + or unresolvable anyway) + * Remove defattr, use license macro + * Hide GL/GLUT/GLEW BuildRequires behind tests bcond, as tests are + not build currently + * Add missing BuildRequires tex(newunicodechar.sty), group doc dependencies + +------------------------------------------------------------------- +Tue Feb 26 19:32:49 UTC 2019 - Bernhard Wiedemann + +- Drop doc tgz that contained duplicated files + to make the package build reproducible (boo#1047218) + +------------------------------------------------------------------- +Sat Dec 29 00:22:34 UTC 2018 - Christoph Junghans + +- Update to version 3.3.7: + + Changes since 3.3.6: Fix compilation with GCC>=6 and compiler optimization turned off. 
+- Added eigen3-3.3.1-fixcmake.patch + +------------------------------------------------------------------- +Tue Nov 7 13:30:09 UTC 2017 - alarrosa@suse.com + +- Remove libqt4 from BuildRequires since it's actually only required + by some demos and tests which are simply not built when it's not + available (except on Leap, where libqt4 is required) + +------------------------------------------------------------------- +Sat Aug 12 16:10:58 UTC 2017 - jengelh@inai.de + +- Use right RPM group. + +------------------------------------------------------------------- +Wed Jul 18 03:00:00 UTC 2017 - cfeck@kde.org + +- Split documentation into its own -doc package due to size. + +------------------------------------------------------------------- +Wed Jul 5 20:26:17 UTC 2017 - asterios.dramis@gmail.com + +- Update to version 3.3.4: + General: + * Improve speed of Jacobi rotation when mixing complex and real + types. + * Bug 1405: enable StrictlyLower/StrictlyUpper triangularView as + the destination of matrix*matrix products. + * UmfPack support: enable changes in the control settings and add + report functions. + * Bug 1423: fix LSCG's Jacobi preconditioner for row-major + matrices. + * Bug 1424: fix compilation issue with abs and unsigned integers + as scalar type. + * Bug 1410: fix lvalue propagation of Array/Matrix-Wrapper with a + const nested expression. + * Bug 1403: fix several implicit scalar type conversion making + SVD decompositions compatible with ceres::Jet. + * Fix some real-to-scalar-to-real useless conversions in + ColPivHouseholderQR. + Regressions: + * Fix dense * sparse-selfadjoint-view product. + * Bug 1417: make LinSpace compatible with std::complex. + * Bug 1400: fix stableNorm alignment issue with + EIGEN_DONT_ALIGN_STATICALLY. + * Bug 1411: fix alignment issue in Quaternion. + * Fix compilation of operations between nested Arrays. + * Bug 1435: fix aliasing issue in expressions like: A = C - B*A. + Others: + * Fix compilation with gcc 4.3 and ARM NEON. + * Fix prefetches on ARM64 and ARM32. + * Fix out-of-bounds check in COLAMD. + * Few minor fixes regarding nvcc/CUDA support, including bug + 1396. + * Improve cmake scripts for Pastix and BLAS detection. + * Bug 1401: fix compilation of "cond ? x : -x" with x an + AutoDiffScalar + * Fix compilation of matrix log with Map as input. + * Add specializations of std::numeric_limits for Eigen::half and + and AutoDiffScalar + * Fix compilation of streaming nested Array, i.e., + cout << Array> + +------------------------------------------------------------------- +Tue Mar 7 21:57:13 UTC 2017 - asterios.dramis@gmail.com + +- Update to version 3.3.3: + * Lots of changes. See + http://eigen.tuxfamily.org/index.php?title=ChangeLog#Eigen_3.3.3 + for details. +- Added new build requirements libboost_headers-devel for + Tumbleweed and boost-devel for openSUSE <= 13.2. +- Rebased 0001-Disable-Altivec-for-ppc64le.patch and + 0001-Do-stack-allignment-on-ppc.patch to apply cleanly. 
+ +------------------------------------------------------------------- +Wed Jan 4 08:49:33 UTC 2017 - olaf@aepfle.de + +- Require texlive-dvips during build + +------------------------------------------------------------------- +Sun Aug 7 20:31:43 UTC 2016 - asterios.dramis@gmail.com + +- Update to version 3.2.9: + Main fixes and improvements: + * Improve numerical robustness of JacobiSVD (backported from 3.3) + * Bug 1017: prevents underflows in makeHouseholder + * Fix numerical accuracy issue in the extraction of complex + eigenvalue pairs in real generalized eigenvalue problems. + * Fix support for vector.homogeneous().asDiagonal() + * Bug 1238: fix SparseMatrix::sum() overload for un-compressed + mode + * Bug 1213: workaround gcc linking issue with anonymous enums. + * Bug 1236: fix possible integer overflow in sparse-sparse + product + * Improve detection of identical matrices when applying a + permutation (e.g., mat = perm * mat) + * Fix usage of nesting type in blas_traits. In practice, this + fixes compilation of expressions such as A*(A*A)^T + * CMake: fixes support of Ninja generator + * Add a StorageIndex typedef to sparse matrices and expressions + to ease porting code to 3.3 + (see http://eigen.tuxfamily.org/index.php?title=3.3#Index_typedef) + * Bug 1200: make aligned_allocator c++11 compatible (backported + from 3.3) + * Bug 1182: improve generality of abs2 (backported from 3.3) + * Bug 537: fix compilation of Quaternion with Apples's compiler + * Bug 1176: allow products between compatible scalar types + * Bug 1172: make valuePtr and innerIndexPtr properly return null + for empty sparse matrices. + * Bug 1170: skip calls to memcpy/memmove for empty inputs. + Others: + * Bug 1242: fix comma initializer with empty matrices. + * Improves support for MKL's PARDISO solver. + * Fix a compilation issue with Pastix solver. + * Add some missing explicit scalar conversions + * Fix a compilation issue with matrix exponential (unsupported + MatrixFunctions module). + * Bug 734: fix a storage order issue in unsupported Spline module + * Bug 1222: fix a compilation issue in AutoDiffScalar + * Bug 1221: shutdown some GCC6's warnings. + * Bug 1175: fix index type conversion warnings in sparse to dense + conversion. +- Removed build requirements gnu-free-fonts and texlive-amsfonts + (not needed anymore). + +------------------------------------------------------------------- +Thu Jun 30 20:46:12 UTC 2016 - asterios.dramis@gmail.com + +- Update to version 3.2.8: + Main fixes and improvements: + * Make FullPivLU::solve use rank() instead of nonzeroPivots(). + * Add EIGEN_MAPBASE_PLUGIN + * Bug 1166: fix issue in matrix-vector products when the + destination is not a vector at compile-time. + * Bug 1100: Improve cmake/pkg-config support. + * Bug 1113: fix name conflict with C99's "I". + * Add missing delete operator overloads in + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + * Fix (A*B).maxCoeff(i) and similar. + * Workaround an ICE with VC2015 Update1 x64. + * Bug 1156: fix several function declarations whose arguments + were passed by value instead of being passed by reference + * Bug 1164: fix std::list and std::deque specializations such + that our aligned allocator is automatically activatived only + when the user did not specified an allocator (or specified the + default std::allocator). + Others: + * Fix BLAS backend (aka MKL) for empty matrix products. + * Bug 1134: fix JacobiSVD pre-allocation. 
+ * Bug 1111: fix infinite recursion in + sparse-column-major.row(i).nonZeros() (it now produces a + compilation error) + * Bug 1106: workaround a compilation issue in Sparse module for + msvc-icc combo + * Bug 1153: remove the usage of __GXX_EXPERIMENTAL_CXX0X__ to + detect C++11 support + * Bug 1143: work-around gcc bug in COLAMD + * Improve support for matrix products with empty factors. + * Fix and clarify documentation of Transform wrt + operator*(MatrixBase) + * Add a matrix-free conjugate gradient example. + * Fix cost computation in CwiseUnaryView (internal) + * Remove custom unaligned loads for SSE. + * Some warning fixes. + * Several other documentation clarifications. +- Updated build requirement superlu to superlu-devel. +- Added a patch "eigen_pkgconfig.patch" to fix pkg-config file + includedir (taken from Fedora). +- Added a patch "01_install_FindEigen3.patch" to install + FindEigen3.cmake (taken from Fedora). + +------------------------------------------------------------------- +Fri Nov 27 13:31:47 UTC 2015 - Rene.vanPaassen@gmail.com + +- Specify eigen header install dir; otherwise the pkgconfig file + defaults to -Iinclude/eigen3 + +------------------------------------------------------------------- +Mon Nov 16 13:14:52 UTC 2015 - p.drouand@gmail.com + +- Update to version 3.2.7 + * Add support for dense.cwiseProduct(sparse). + * Fix a regression regarding (dense*sparse).diagonal(). + * Make the IterativeLinearSolvers module compatible with MPL2-only + mode by defaulting to COLAMDOrdering and NaturalOrdering for ILUT + and ILLT respectively. + * Bug 266: backport support for c++11 move semantic + * operator/=(Scalar) now performs a true division (instead of mat*(1/s)) + * Improve numerical accuracy in LLT and triangular solve by using + true scalar divisions (instead of mat * (1/s)) + * Bug 1092: fix iterative solver constructors for expressions as input + * Bug 1088: fix setIdenity for non-compressed sparse-matrix + * Bug 1086: add support for recent SuiteSparse versions + * Add overloads for real-scalar times SparseMatrix operations. + This avoids real to complex conversions, and also fixes a compilation + issue with MSVC. + * Use explicit Scalar types for AngleAxis initialization + * Fix several shortcomings in cost computation (avoid multiple + re-evaluation in some very rare cases). + * Bug 1090: fix a shortcoming in redux logic for which + slice-vectorization plus unrolling might happen. + * Fix compilation issue with MSVC by backporting + DenseStorage::operator= from devel branch. + * Bug 1063: fix nesting of unsupported/AutoDiffScalar to prevent + dead references when computing second-order derivatives + * Bug 1100: remove explicit CMAKE_INSTALL_PREFIX prefix to conform + to cmake install's DESTINATION parameter. + * unsupported/ArpackSupport is now properly installed by make install. 
+ * Bug 1080: warning fixes +- Changes from version 3.2.6 + * fix some compilation issues with MSVC 2013, including bugs 1000 and 1057 + * SparseLU: fixes to support EIGEN_DEFAULT_TO_ROW_MAJOR (bug 1053), and + for empty (bug 1026) and some structurally rank deficient matrices (bug 792) + * Bug 1075: fix AlignedBox::sample() for Dynamic dimension + * fix regression in AMD ordering when a column has only one off-diagonal + non-zero (used in sparse Cholesky) + * fix Jacobi preconditioner with zero diagonal entries + * fix Quaternion identity initialization for non-implicitly convertible types + * Bug 1059: fix predux_max for NEON + * Bug 1039: fix some issues when redefining EIGEN_DEFAULT_DENSE_INDEX_TYPE + * Bug 1062: fix SelfAdjointEigenSolver for RowMajor matrices + * MKL: fix support for the 11.2 version, and fix a naming conflict (bug 1067) + * Bug 1033: explicit type conversion from 0 to RealScalar +- + +------------------------------------------------------------------- +Sat Sep 5 08:10:52 UTC 2015 - mpluskal@suse.com + +- Update to 3.2.5 + * Changes with main impact: + + Improve robustness of SimplicialLDLT to semidefinite problems + by correctly handling structural zeros in AMD reordering + + Re-enable supernodes in SparseLU (fix a performance + regression in SparseLU) + + Use zero guess in ConjugateGradients::solve + + Add PermutationMatrix::determinant method + + Fix SparseLU::signDeterminant() method, and add a + SparseLU::determinant() method + + Allows Lower|Upper as a template argument of CG and MINRES: + in this case the full matrix will be considered + + Bug 872: remove usage of std::bind* functions (deprecated in + c++11) + * Numerical robustness improvements: + + Bug 1014: improve numerical robustness of the 3x3 direct + eigenvalue solver + + Bug 1013: fix 2x2 direct eigenvalue solver for identical + eigenvalues + + Bug 824: improve accuracy of Quaternion::angularDistance + + Bug 941: fix an accuracy issue in ColPivHouseholderQR by + continuing the decomposition on a small pivot + + Bug 933: improve numerical robustness in RealSchur + + Fix default threshold value in SPQR + * Other changes: + + Fix usage of EIGEN_NO_AUTOMATIC_RESIZING + + Improved support for custom scalar types in SparseLU + + Improve cygwin compatibility + + Bug 650: fix an issue with sparse-dense product and + rowmajor matrices + + Bug 704: fix MKL support (HouseholderQR) + + Bug 705: fix handling of Lapack potrf return code (LLT) + + Bug 714: fix matrix product with OpenMP support + + Bug 949: add static assertions for incompatible scalar + types in many of the dense decompositions + + Bugs 957, 1000: workaround MSVC/ICC compilation issues when + using sparse blocks + + Bug 969: fix ambiguous calls to Ref + + Bugs 972, 986: add support for coefficient-based product + with 0 depth + + Bug 980: fix taking a row (resp. column) of a column-major + (resp. 
row-major) sparse matrix + + Bug 983: fix an alignement issue in Quaternion + + Bug 985: fix RealQZ when either matrix had zero rows or + columns + + Bug 987: fix alignement guess in diagonal product + + Bug 993: fix a pitfall with matrix.inverse() + + Bugs 996, 1016: fix scalar conversions + + Bug 1003: fix handling of pointers non aligned on scalar + boundary in slice-vectorization + + Bug 1010: fix member initialization in IncompleteLUT + + Bug 1012: enable alloca on Mac OS or if alloca is defined + as macro + + Doc and build system: 733, 914, 952, 961, 999 +- Use cmake macros +- Use url for source +- Cleanup spec file with spec-cleaner +- Remove conditional buildrequires for releases which did not + build anyway + +------------------------------------------------------------------- +Wed Apr 22 20:41:28 UTC 2015 - asterios.dramis@gmail.com + +- Update to version 3.2.4: + * Fix compilation regression in Rotation2D + * Bug 920: fix compilation issue with MSVC 2015. + * Bug 921: fix utilization of bitwise operation on enums in + first_aligned. + * Fix compilation with NEON on some platforms. + From version 3.2.3: + Core: + * Enable Mx0 * 0xN matrix products. + * Bug 859: fix returned values for vectorized versions of + exp(NaN), log(NaN), sqrt(NaN) and sqrt(-1). + * Bug 879: tri1 = mat * tri2 was compiling and running + incorrectly if tri2 was not numerically triangular. Workaround + the issue by evaluating mat*tri2 into a temporary. + * Bug 854: fix numerical issue in + SelfAdjointEigenSolver::computeDirect for 3x3 matrices. + * Bug 884: make sure there no call to malloc for zero-sized + matrices or for a Ref<> without temporaries. + * Bug 890: fix aliasing detection when applying a permutation. + * Bug 898: MSVC optimization by adding inline hint to + const_cast_ptr. + * Bug 853: remove enable_if<> in Ref<> ctor. + Dense solvers: + * Bug 894: fix the sign returned by LDLT for multiple calls to + compute(). + * Fix JacobiSVD wrt underflow and overflow. + * Bug 791: fix infinite loop in JacobiSVD in the presence of + NaN. + Sparse: + * Fix out-of-bounds memory write when the product of two sparse + matrices is completely dense and performed using pruning. + * UmfPack support: fix redundant evaluation/copies when calling + compute(), add support for generic expressions as input, and + fix extraction of the L and U factors (Bug 911). + * Improve SparseMatrix::block for const matrices (the generic + path was used). + * Fix memory pre-allocation when permuting inner vectors of a + sparse matrix. + * Fix SparseQR::rank for a completely empty matrix. + * Fix SparseQR for row-major inputs. + * Fix SparseLU::absDeterminant and add respective unit test. + * BiCGSTAB: make sure that good initial guesses are not + destroyed by a bad preconditioner. + Geometry: + * Fix Hyperplane::Through(a,b,c) when points are aligned or + identical. + * Fix linking issues in OpenGLSupport. + OS, build system and doc: + * Various compilation fixes including: bug 821, bug 822, + bug 857, bug 871, bug 873. + * Fix many compilation warnings produced by recent compilers + including: bug 909. + * Bug 861: enable posix_memalign with PGI. + * Fix BiCGSTAB doc example. + +------------------------------------------------------------------- +Sat Aug 9 21:02:38 UTC 2014 - asterios.dramis@gmail.com + +- Update to version 3.2.2: + Core: + * Relax Ref such that Ref accepts a RowVectorXf which can be seen + as a degenerate MatrixXf(1,N) + * Fix performance regression for the vectorization of sub columns/rows of + matrices. 
+ * EIGEN_STACK_ALLOCATION_LIMIT: Raise its default value to 128KB, make use + of it to assert on maximal fixed size object, and allows it to be 0 to + mean "no limit". + * Bug 839: Fix 1x1 triangular matrix-vector product. + * Bug 755: CommaInitializer produced wrong assertions in absence of + Return-Value-Optimization. + Dense solvers: + * Add a rank() method with threshold control to JacobiSVD, and make solve + uses it to return the minimal norm solution for rank-deficient problems. + * Various numerical fixes in JacobiSVD, including:bug 843, and the move from + Lapack to Matlab strategy for the default threshold. + * Various numerical fixes in LDLT, including the case of semi-definite + complex matrices. + * Fix ColPivHouseholderQR::rank(). + * Bug 222: Make temporary matrix column-major independently of + EIGEN_DEFAULT_TO_ROW_MAJOR in BlockHouseholder. + Sparse: + * http://eigen.tuxfamily.org/bz/show_bug.cgi?id=838 Bug 838]: Fix dense + * sparse and sparse * dense outer products and detect outer products from + either the lhs or rhs. + * Make the ordering method of SimplicialL[D]LT configurable. + * Fix regression in the restart mechanism of BiCGSTAB. + * Bug 836: extend SparseQR to support more columns than rows. + * Bug 808: Use double instead of float for the increasing size ratio in + CompressedStorage::resize, fix implicit conversions from int/longint to + float/double, and fix set_from_triplets temporary matrix type. + * Bug 647: Use smart_copy instead of bitwise memcpy in CompressedStorage. + * GMRES: Initialize essential Householder vector with correct dimension. + Geometry: + * Bug 807: Missing scalar type cast in umeyama() + * Bug 806: Missing scalar type cast in Quaternion::setFromTwoVectors() + * Bug 759: Removed hard-coded double-math from Quaternion::angularDistance. + OS, build system and doc: + * Fix compilation with Windows CE. + * Fix some ICEs with VC11. + * Check IMKL version for compatibility with Eigen + * Bug 754: Only inserted (!defined(_WIN32_WCE)) analog to alloc and free + implementation. + * Bug 803: Avoid char* to int* conversion. + * Bug 819: Include path of details.h file. + * Bug 738: Use the "current" version of cmake project directories to ease + the inclusion of Eigen within other projects. + * Bug 815: Fix doc of FullPivLU wrt permutation matrices. + * Bug 632: doc: Note that dm2 = sm1 + dm1 is not possible + * Extend AsciiQuickReference (real, imag, conjugate, rot90) +- Added metis-devel build requirement for openSUSE > 13.1. + +------------------------------------------------------------------- +Thu Jul 31 12:52:18 UTC 2014 - dvaleev@suse.com + +- Allign the stack on powerpc + +- added patches: + * 0001-Do-stack-allignment-on-ppc.patch +------------------------------------------------------------------- +Wed Jul 30 15:55:52 UTC 2014 - dvaleev@suse.com + +- Disable altivec on ppc64le. eigen3 have no LE altivec inplemented + yet. + +- added patches: + * 0001-Disable-Altivec-for-ppc64le.patch +------------------------------------------------------------------- +Thu Apr 10 21:05:43 UTC 2014 - asterios.dramis@gmail.com + +- Update to version 3.2.1: + * Eigen2 support is now deprecated and will be removed in version 3.3. + Core: + * Bug fix for Ref object containing a temporary matrix. + * Bug 654: Allow construction of row vector from 1D array. + * Bug 679: Support cwiseMin() and cwiseMax() on maps. + * Support conservativeResize() on vectors. + * Improve performance of vectorwise and replicate expressions. 
+ * Bug 642: Add vectorization of sqrt for doubles, and make sqrt really safe + if EIGEN_FAST_MATH is disabled. + * Bug 616: Try harder to align columns when printing matrices and arrays. + * Bug 579: Add optional run-time parameter to fixed-size block methods. + * Implement .all() and .any() for zero-sized objects + * Bug 708: Add placement new and delete for arrays. + * Bug 503: Better C++11 support. + Dense linear algebra: + * Bug 689: Speed up some matrix-vector products by using aligned loads if + possible. + * Make solve in FullPivHouseholderQR return least-square solution if there + is no exact solution. + * Bug 678: Fix fullPivHouseholderQR for rectangular matrices. + * Fix a 0/0 issue in JacobiSVD. + * Bug 736: Wrong result in LDLT::isPositiveDefinite() for semi-definite + matrices. + * Bug 740: Fix overflow issue in stableNorm(). + * Make pivoting HouseholderQR compatible with custom scalar types. + Geometry: + * Fix compilation of Transform * UniformScaling + Sparse matrices: + * Fix elimination tree and SparseQR for fat rectangular matrices. + * Bug 635: add isCompressed to MappedSparseMatrix for compatibility. + * Bug 664: Support iterators without operator< in setFromTriplets(). + * Fixes in SparseLU: infinite loop, aliasing issue when solving, overflow in + memory allocation, use exceptions only if enabled (bug 672). + * Fixes in SparseQR: reduce explicit zero, assigning result to map, assert + catching non-conforming sizes, memory leak. + * Bug 681: Uninitialized value in CholmodSupport which may lead to incorrect + results. + * Fix some issues when using a non-standard index type (bug 665 and more) + * Update constrained CG (unsupported module) to Eigen3. + OS and build system: + * MacOS put OpenGL header files somewhere else from where we expected it. + * Do not assume that alloca() is 16-byte aligned on Windows. + * Compilation fixes when using ICC with Visual Studio. + * Fix Fortran compiler detection in CMake files. + * Fix some of our tests (bugs 744 and 748 and more). + * Fix a few compiler warnings (bug 317 and more). + * Documentation fixes (bugs 609, 638 and 739 and more). +- Renamed the package from libeigen3-devel to eigen3. Added eigen3-devel + subpackage with the necessary Provides/Obsoletes entries for libeigen3-devel. + +------------------------------------------------------------------- +Mon Aug 12 21:23:59 UTC 2013 - asterios.dramis@gmail.com + +- Update to version 3.2.0: + * See http://eigen.tuxfamily.org/index.php?title=ChangeLog#Eigen_3.2.0 for + changes. +- Removed eigen-3.1.2-remove-include-of-removed-header-file.patch (fixed + upstream). + +------------------------------------------------------------------- +Sun May 19 17:53:45 UTC 2013 - asterios.dramis@gmail.com + +- Update to version 3.1.3: + * Bug 526 - Fix linear vectorized transversal in linspace. + * Bug 551 - Fix compilation issue when using EIGEN_DEFAULT_DENSE_INDEX_TYPE. + * Bug 533 - Fix some missing const qualifiers in Transpose + * Fix a compilation with CGAL::Gmpq by adding explicit internal:: namespace + when calling abs(). + * Fix computation of outer-stride when calling .real() or .imag(). + * Fix handmade_aligned_realloc (affected conservativeResize()). + * Fix sparse vector assignment from a sparse matrix. + * Fix log(0) with SSE. + * Fix bug in aligned_free with windows CE. + * Fix traits of Map 12.2. 
+ +------------------------------------------------------------------- +Fri Oct 5 19:13:06 UTC 2012 - asterios.dramis@gmail.com + +- Update to version 3.1.1: + * Relicense to MPL2 + * Add a EIGEN_MPL2_ONLY build option to generate compiler errors when + including non-MPL2 modules + * Remove dynamic allocation for triangular matrix-matrix products of fixed + size objects + * Fix possible underflow issues in SelfAdjointEigenSolver + * Fix issues with fixed-size Diagonal (sub/super diagonal size computation + was wrong) + * Bug 487 - Geometry module: isometry * scaling compilation error + * Bug 486 - MKL support: fixed multiple-references linker errors with various + decompositions + * Bug 480 - work around compilation error on Android NDK due to isfinite + being defined as a macro + * Bug 485 - IterativeLinearSolvers: conflict between a typedef and template + type parameter + * Bug 479 - Eigenvalues/Schur: Adjust max iterations count to matrix size + * Fixed Geometry module compilation under MSVC + * Fixed Sparse module compilation under MSVC 2005 +- Updated the package license to "MPL-2.0 and LGPL-2.1+ and BSD-3-Clause". +- Use pkgconfig(gl) instead of Mesa-libGL-devel as build requirement. +- Added texlive-amsfonts build requirement for openSUSE > 12.2 to fix some + errors during documentation building. + +------------------------------------------------------------------- +Sun Jul 22 12:42:03 UTC 2012 - asterios.dramis@gmail.com + +- Update to version 3.1.0: + * See http://eigen.tuxfamily.org/index.php?title=ChangeLog#Eigen_3.1.0 for + changes. +- Added new build requirements freeglut-devel, glew-devel and for + openSUSE > 12.1 suitesparse-devel. + +------------------------------------------------------------------- +Sun May 20 16:43:44 UTC 2012 - asterios.dramis@gmail.com + +- Added gnu-free-fonts as build requirement in order to fix compilation of the + development documentation. + +------------------------------------------------------------------- +Sun Apr 1 14:54:02 UTC 2012 - asterios.dramis@gmail.com + +- Don't build development documentation for openSUSE 11.4 (fails to build). +- Fix rpmlint warning "zero-length". + +------------------------------------------------------------------- +Tue Mar 27 20:21:10 UTC 2012 - asterios.dramis@gmail.com + +- Initial release (version 3.0.5). diff --git a/eigen3.spec b/eigen3.spec new file mode 100644 index 0000000..a23bafb --- /dev/null +++ b/eigen3.spec @@ -0,0 +1,164 @@ +# +# spec file +# +# Copyright (c) 2022 SUSE LLC +# +# All modifications and additions to the file contributed by third parties +# remain the property of their copyright owners, unless otherwise agreed +# upon. The license for this file, and modifications and additions to the +# file, is the same license as for the pristine package itself (unless the +# license for the pristine package is not an Open Source License, in which +# case the license is the MIT License). An "Open Source License" is a +# license that conforms to the Open Source Definition (Version 1.9) +# published by the Open Source Initiative. + +# Please submit bugfixes or comments via https://bugs.opensuse.org/ +# + + +%global flavor @BUILD_FLAVOR@%{nil} +%global pkgname eigen3 +%global srcname eigen + +# The OpenGL support test fails +%bcond_with opengl_test + +# Tests fail for different reasons within the test-suite itself; disable for now +# See e.g. 
https://gitlab.com/libeigen/eigen/-/issues/2088, https://gitlab.com/libeigen/eigen/-/issues/2092 +# Also balloons the resources required: > 32 GiB disk space + >= 12 GiB memory +%bcond_with tests + +%if "%{flavor}" == "docs" +%define pkgsuffix -doc +%endif + +Name: eigen3%{?pkgsuffix} +Version: 3.4.0 +Release: 0 +Summary: C++ Template Library for Linear Algebra +License: BSD-3-Clause AND LGPL-2.1-only AND MPL-2.0 AND LGPL-2.1-or-later +URL: http://eigen.tuxfamily.org/ +Source0: https://gitlab.com/libeigen/eigen/-/archive/%{version}/%{srcname}-%{version}.tar.bz2 +Patch0: 0001-Disable-Altivec-for-ppc64le.patch +Patch1: 0001-Do-stack-allignment-on-ppc.patch +# PATCH-FIX-UPSTREAM -- https://gitlab.com/libeigen/eigen/-/merge_requests/680.patch +Patch2: fix_ppc64le_always_inline_680.patch +%if %{with tests} +# SECTION Patches to fix tests +# PATCH-FIX-UPSTREAM eigen3-googlehash-detection.patch badshah400@gmail.com -- GoogleHash needs C++11 std to compile test code and be succesfully detected +Patch9: eigen3-googlehash-detection.patch +# PATCH-FIX-UPSTREAM eigen3-fix-forward_adolc-unit-test.patch badshah400@gmail -- Prevent conflict of std::min/max with eigen's macros by importing eigen test-suite's main.h header only after all system headers have been included +Patch10: eigen3-fix-forward_adolc-unit-test.patch +# /SECTION +%endif +BuildRequires: adolc-devel +BuildRequires: cmake +BuildRequires: fftw3-devel +BuildRequires: gcc-c++ +BuildRequires: gcc-fortran +BuildRequires: gmp-devel +BuildRequires: gsl-devel +BuildRequires: libboost_headers-devel +BuildRequires: metis-devel +BuildRequires: mpfr-devel +BuildRequires: pkg-config +BuildRequires: sparsehash-devel +BuildRequires: suitesparse-devel +BuildRequires: superlu-devel +%if "%{flavor}" == "docs" +BuildRequires: doxygen +BuildRequires: fdupes +BuildRequires: graphviz +BuildRequires: graphviz-gd +BuildRequires: texlive-dvips +BuildRequires: texlive-latex +BuildRequires: tex(newunicodechar.sty) +%endif +%if %{with opengl_test} +BuildRequires: freeglut-devel +BuildRequires: glew-devel +BuildRequires: pkgconfig(gl) +%endif +BuildArch: noarch + +%description +Eigen is a C++ template library for linear algebra: matrices, vectors, +numerical solvers, and related algorithms. + +%package devel +Summary: C++ Template Library for Linear Algebra +# libeigen3-devel was last used at openSUSE 13.1 (version 3.2.0) +Provides: libeigen3-devel = %{version} +Obsoletes: libeigen3-devel < %{version} + +%description devel +Eigen is a C++ template library for linear algebra: matrices, vectors, +numerical solvers, and related algorithms. 
+ +%if "%{flavor}" == "docs" +Summary: Documentation for the Eigen3 C++ Template Library for Linear Algebra + +%description +Documentation in HTML format for the Eigen3 C++ Template Library +for Linear Algebra +%endif + +%prep +%autosetup -p1 -n %{srcname}-%{version} + +# Fix rpmlint warning "wrong-file-end-of-line-encoding" +sed -i 's/\r$//' COPYING.MINPACK + +# Remove build time references so build-compare can do its work +echo "HTML_TIMESTAMP = NO" >> doc/Doxyfile.in + +%build +%cmake \ + -DINCLUDE_INSTALL_DIR:PATH=include/eigen3 \ + -DCMAKE_SKIP_RPATH:BOOL=OFF \ + -DCMAKE_SKIP_INSTALL_RPATH:BOOL=ON \ + -DEIGEN_TEST_CXX11:Bool=%{?with_tests:ON}%{!?with_tests:OFF} \ + -DEIGEN_TEST_OPENMP:Bool=%{?with_tests:ON}%{!?with_tests:OFF} + +%if "%{flavor}" == "" +%cmake_build all %{?with_tests:buildtests} +%else +%cmake_build doc +%endif + +rm -f doc/html/*.tgz +find doc -name _formulas.log -print -delete + +%install +%if "%{flavor}" == "" +%cmake_install +%else +%fdupes -s build/doc/html/ +%endif + +%if "%{flavor}" == "" +%if %{with tests} +%check +# Run with a fixed seed to prevent random failures: https://gitlab.com/libeigen/eigen/-/issues/2088 +export EIGEN_SEED=100 +# Repeat each test once to reduce time spent, since we use a fixed seed anyway +export EIGEN_REPEAT=1 +%ctest +%endif +%endif + +%if "%{flavor}" == "docs" +%files +%doc build/doc/html/ + +%else + +%files devel +%license COPYING.* +%{_includedir}/eigen3/ +%{_datadir}/eigen3/ +%{_datadir}/pkgconfig/eigen3.pc + +%endif + +%changelog diff --git a/fix_ppc64le_always_inline_680.patch b/fix_ppc64le_always_inline_680.patch new file mode 100644 index 0000000..2dc2df2 --- /dev/null +++ b/fix_ppc64le_always_inline_680.patch @@ -0,0 +1,3138 @@ +From 9e3873b1dce3ba65980c7e7b979325dac2fb4bbd Mon Sep 17 00:00:00 2001 +From: Chip-Kerchner +Date: Wed, 20 Oct 2021 11:06:50 -0500 +Subject: [PATCH 1/2] New branch for inverting rows and depth in non-vectorized + portion of packing. 
+ +--- + Eigen/src/Core/arch/AltiVec/Complex.h | 10 +- + Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 1546 ++++++++--------- + .../Core/arch/AltiVec/MatrixProductCommon.h | 206 +-- + .../src/Core/arch/AltiVec/MatrixProductMMA.h | 335 ++-- + 4 files changed, 927 insertions(+), 1170 deletions(-) + +diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h +index f730ce8d3..4fd923e84 100644 +--- a/Eigen/src/Core/arch/AltiVec/Complex.h ++++ b/Eigen/src/Core/arch/AltiVec/Complex.h +@@ -129,20 +129,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< + template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } + template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } + +-EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) ++EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) + { + Packet4f res0, res1; + #ifdef __VSX__ +- __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); +- __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); ++ __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); ++ __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); + #ifdef _BIG_ENDIAN + __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); + #else + __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); + #endif + #else +- *reinterpret_cast *>(&res0) = *from0; +- *reinterpret_cast *>(&res1) = *from1; ++ *reinterpret_cast *>(&res0) = from0; ++ *reinterpret_cast *>(&res1) = from1; + res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); + #endif + return Packet2cf(res0); +diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +index 1d67d60d0..bd5da3623 100644 +--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h ++++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +@@ -166,24 +166,23 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc + + rir += vectorDelta; + } +- if (j < cols) ++ ++ for(; j < cols; j++) + { +- rii = rir + ((cols - j) * rows); ++ rii = rir + rows; + + for(Index i = k2; i < depth; i++) + { +- Index k = j; +- for(; k < cols; k++) +- { +- std::complex v = getAdjointVal(i, k, rhs); ++ std::complex v = getAdjointVal(i, j, rhs); + +- blockBf[rir] = v.real(); +- blockBf[rii] = v.imag(); ++ blockBf[rir] = v.real(); ++ blockBf[rii] = v.imag(); + +- rir += 1; +- rii += 1; +- } ++ rir += 1; ++ rii += 1; + } ++ ++ rir += rows; + } + } + +@@ -262,19 +261,15 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs + } + } + +- if (j < cols) ++ for(; j < cols; j++) + { + for(Index i = k2; i < depth; i++) + { +- Index k = j; +- for(; k < cols; k++) +- { +- if(k <= i) +- blockB[ri] = rhs(i, k); +- else +- blockB[ri] = rhs(k, i); +- ri += 1; +- } ++ if(j <= i) ++ blockB[ri] = rhs(i, j); ++ else ++ blockB[ri] = rhs(j, i); ++ ri += 1; + } + } + } +@@ -408,22 +403,18 @@ struct symm_pack_lhs + * and offset and behaves accordingly. 
+ **/ + +-template +-EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +-{ +- const Index size = 16 / sizeof(Scalar); +- pstore(to + (0 * size), block.packet[0]); +- pstore(to + (1 * size), block.packet[1]); +- pstore(to + (2 * size), block.packet[2]); +- pstore(to + (3 * size), block.packet[3]); +-} +- +-template +-EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) ++template ++EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) + { + const Index size = 16 / sizeof(Scalar); + pstore(to + (0 * size), block.packet[0]); + pstore(to + (1 * size), block.packet[1]); ++ if (N > 2) { ++ pstore(to + (2 * size), block.packet[2]); ++ } ++ if (N > 3) { ++ pstore(to + (3 * size), block.packet[3]); ++ } + } + + // General template for lhs & rhs complex packing. +@@ -449,9 +440,9 @@ struct dhs_cpack { + PacketBlock cblock; + + if (UseLhs) { +- bload(cblock, lhs, j, i); ++ bload(cblock, lhs, j, i); + } else { +- bload(cblock, lhs, i, j); ++ bload(cblock, lhs, i, j); + } + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); +@@ -478,8 +469,8 @@ struct dhs_cpack { + ptranspose(blocki); + } + +- storeBlock(blockAt + rir, blockr); +- storeBlock(blockAt + rii, blocki); ++ storeBlock(blockAt + rir, blockr); ++ storeBlock(blockAt + rii, blocki); + + rir += 4*vectorSize; + rii += 4*vectorSize; +@@ -499,21 +490,12 @@ struct dhs_cpack { + cblock.packet[1] = lhs.template loadPacket(i, j + 2); + } + } else { +- std::complex lhs0, lhs1; + if (UseLhs) { +- lhs0 = lhs(j + 0, i); +- lhs1 = lhs(j + 1, i); +- cblock.packet[0] = pload2(&lhs0, &lhs1); +- lhs0 = lhs(j + 2, i); +- lhs1 = lhs(j + 3, i); +- cblock.packet[1] = pload2(&lhs0, &lhs1); ++ cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i)); ++ cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i)); + } else { +- lhs0 = lhs(i, j + 0); +- lhs1 = lhs(i, j + 1); +- cblock.packet[0] = pload2(&lhs0, &lhs1); +- lhs0 = lhs(i, j + 2); +- lhs1 = lhs(i, j + 3); +- cblock.packet[1] = pload2(&lhs0, &lhs1); ++ cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1)); ++ cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3)); + } + } + +@@ -535,34 +517,50 @@ struct dhs_cpack { + rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); + } + +- if (j < rows) ++ if (!UseLhs) + { +- if(PanelMode) rir += (offset*(rows - j - vectorSize)); +- rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); ++ if(PanelMode) rir -= (offset*(vectorSize - 1)); + +- for(Index i = 0; i < depth; i++) ++ for(; j < rows; j++) + { +- Index k = j; +- for(; k < rows; k++) ++ rii = rir + ((PanelMode) ? stride : depth); ++ ++ for(Index i = 0; i < depth; i++) + { +- if (UseLhs) { ++ blockAt[rir] = lhs(i, j).real(); ++ ++ if(Conjugate) ++ blockAt[rii] = -lhs(i, j).imag(); ++ else ++ blockAt[rii] = lhs(i, j).imag(); ++ ++ rir += 1; ++ rii += 1; ++ } ++ ++ rir += ((PanelMode) ? (2*stride - depth) : depth); ++ } ++ } else { ++ if (j < rows) ++ { ++ if(PanelMode) rir += (offset*(rows - j - vectorSize)); ++ rii = rir + (((PanelMode) ? 
stride : depth) * (rows - j)); ++ ++ for(Index i = 0; i < depth; i++) ++ { ++ Index k = j; ++ for(; k < rows; k++) ++ { + blockAt[rir] = lhs(k, i).real(); + + if(Conjugate) + blockAt[rii] = -lhs(k, i).imag(); + else + blockAt[rii] = lhs(k, i).imag(); +- } else { +- blockAt[rir] = lhs(i, k).real(); + +- if(Conjugate) +- blockAt[rii] = -lhs(i, k).imag(); +- else +- blockAt[rii] = lhs(i, k).imag(); ++ rir += 1; ++ rii += 1; + } +- +- rir += 1; +- rii += 1; + } + } + } +@@ -588,16 +586,16 @@ struct dhs_pack{ + PacketBlock block; + + if (UseLhs) { +- bload(block, lhs, j, i); ++ bload(block, lhs, j, i); + } else { +- bload(block, lhs, i, j); ++ bload(block, lhs, i, j); + } + if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) + { + ptranspose(block); + } + +- storeBlock(blockA + ri, block); ++ storeBlock(blockA + ri, block); + + ri += 4*vectorSize; + } +@@ -632,21 +630,33 @@ struct dhs_pack{ + if(PanelMode) ri += vectorSize*(stride - offset - depth); + } + +- if (j < rows) ++ if (!UseLhs) + { +- if(PanelMode) ri += offset*(rows - j); ++ if(PanelMode) ri += offset; + +- for(Index i = 0; i < depth; i++) ++ for(; j < rows; j++) + { +- Index k = j; +- for(; k < rows; k++) ++ for(Index i = 0; i < depth; i++) + { +- if (UseLhs) { ++ blockA[ri] = lhs(i, j); ++ ri += 1; ++ } ++ ++ if(PanelMode) ri += stride - depth; ++ } ++ } else { ++ if (j < rows) ++ { ++ if(PanelMode) ri += offset*(rows - j); ++ ++ for(Index i = 0; i < depth; i++) ++ { ++ Index k = j; ++ for(; k < rows; k++) ++ { + blockA[ri] = lhs(k, i); +- } else { +- blockA[ri] = lhs(i, k); ++ ri += 1; + } +- ri += 1; + } + } + } +@@ -682,7 +692,7 @@ struct dhs_pack(j, i + 1); + } + +- storeBlock(blockA + ri, block); ++ storeBlock(blockA + ri, block); + + ri += 2*vectorSize; + } +@@ -759,7 +769,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] + block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] + +- storeBlock(blockB + ri, block); ++ storeBlock(blockB + ri, block); + } + + ri += 4*vectorSize; +@@ -790,19 +800,17 @@ struct dhs_pack(blockAt + rir, blockr); +- storeBlock(blockAt + rii, blocki); ++ storeBlock(blockAt + rir, blockr); ++ storeBlock(blockAt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; +@@ -943,7 +951,7 @@ struct dhs_cpack cblock; + PacketBlock blockr, blocki; + +- bload(cblock, rhs, i, j); ++ bload(cblock, rhs, i, j); + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); + blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); +@@ -957,8 +965,8 @@ struct dhs_cpack(blockBt + rir, blockr); +- storeBlock(blockBt + rii, blocki); ++ storeBlock(blockBt + rir, blockr); ++ storeBlock(blockBt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; +@@ -967,27 +975,26 @@ struct dhs_cpack +-EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +-{ +- if(NegativeAccumulate) +- { +- acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); +- acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); +- acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); +- acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); +- } else { +- acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); +- acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); +- acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); +- acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); +- } +-} +- +-template +-EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, 
const Packet* rhsV) ++template ++EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) + { + if(NegativeAccumulate) + { + acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); ++ if (N > 1) { ++ acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); ++ } ++ if (N > 2) { ++ acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); ++ } ++ if (N > 3) { ++ acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); ++ } + } else { + acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); ++ if (N > 1) { ++ acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); ++ } ++ if (N > 2) { ++ acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); ++ } ++ if (N > 3) { ++ acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); ++ } + } + } + +@@ -1030,11 +1038,11 @@ EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con + { + Packet lhsV = pload(lhs); + +- pger_common(acc, lhsV, rhsV); ++ pger_common(acc, lhsV, rhsV); + } + +-template +-EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) ++template ++EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV) + { + #ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); +@@ -1046,32 +1054,32 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In + #endif + } + +-template +-EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) ++template ++EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) + { + Packet lhsV; +- loadPacketRemaining(lhs, lhsV, remaining_rows); ++ loadPacketRemaining(lhs, lhsV); + +- pger_common(acc, lhsV, rhsV); ++ pger_common(acc, lhsV, rhsV); + } + + // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
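As a standalone illustration (not part of the patch itself): the new pger_common above keys everything off a compile-time column count N, so the `if (N > k)` branches are constant-folded and one template replaces the earlier 4-wide and 1-wide overloads. A minimal scalar sketch of the same pattern, using a hypothetical helper name fma_cols:

#include <cstdio>

template<int N>
inline void fma_cols(float* acc, float lhs, const float* rhs)
{
  acc[0] += lhs * rhs[0];                  // column 0 always exists
  if (N > 1) acc[1] += lhs * rhs[1];       // dead code when N == 1,
  if (N > 2) acc[2] += lhs * rhs[2];       // removed at compile time
  if (N > 3) acc[3] += lhs * rhs[3];
}

int main()
{
  float acc[4] = {0.f, 0.f, 0.f, 0.f};
  const float rhs[4] = {1.f, 2.f, 3.f, 4.f};
  fma_cols<4>(acc, 2.f, rhs);              // acc becomes {2, 4, 6, 8}
  std::printf("%g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]);
  return 0;
}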
+ template + EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) + { +- pger_common(accReal, lhsV, rhsV); ++ pger_common(accReal, lhsV, rhsV); + if(LhsIsReal) + { +- pger_common(accImag, lhsV, rhsVi); ++ pger_common(accImag, lhsV, rhsVi); + EIGEN_UNUSED_VARIABLE(lhsVi); + } else { + if (!RhsIsReal) { +- pger_common(accReal, lhsVi, rhsVi); +- pger_common(accImag, lhsV, rhsVi); ++ pger_common(accReal, lhsVi, rhsVi); ++ pger_common(accImag, lhsV, rhsVi); + } else { + EIGEN_UNUSED_VARIABLE(rhsVi); + } +- pger_common(accImag, lhsVi, rhsV); ++ pger_common(accImag, lhsVi, rhsV); + } + } + +@@ -1086,8 +1094,8 @@ EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); + } + +-template +-EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) ++template ++EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi) + { + #ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); +@@ -1103,11 +1111,11 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar + #endif + } + +-template +-EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) ++template ++EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) + { + Packet lhsV, lhsVi; +- loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); ++ loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi); + + pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); + } +@@ -1119,132 +1127,142 @@ EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) + } + + // Zero the accumulator on PacketBlock. +-template +-EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +-{ +- acc.packet[0] = pset1((Scalar)0); +- acc.packet[1] = pset1((Scalar)0); +- acc.packet[2] = pset1((Scalar)0); +- acc.packet[3] = pset1((Scalar)0); +-} +- +-template +-EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) ++template ++EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) + { + acc.packet[0] = pset1((Scalar)0); ++ if (N > 1) { ++ acc.packet[1] = pset1((Scalar)0); ++ } ++ if (N > 2) { ++ acc.packet[2] = pset1((Scalar)0); ++ } ++ if (N > 3) { ++ acc.packet[3] = pset1((Scalar)0); ++ } + } + + // Scale the PacketBlock vectors by alpha. 
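For reference, a scalar sketch (an illustration under my own naming, not patch code) of the complex multiply-accumulate that pgerc_common above decouples into separate real and imaginary accumulators; the madd/nmsub pairs are exactly the textbook complex product:

#include <cstdio>

int main()
{
  double accReal = 0.0, accImag = 0.0;
  const double lr = 3.0, li = 4.0;    // lhs = 3 + 4i
  const double rr = 2.0, ri = -1.0;   // rhs = 2 - 1i

  accReal += lr * rr - li * ri;       // vec_madd then vec_nmsub
  accImag += lr * ri + li * rr;       // two vec_madd steps

  std::printf("%g %+gi\n", accReal, accImag);   // prints: 10 +5i
  return 0;
}

When LhsIsReal or RhsIsReal is set, the corresponding cross terms simply drop out, which is why the template skips those pger_common calls.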
+-template +-EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +-{ +- acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); +- acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); +- acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); +- acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); +-} +- +-template +-EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) ++template ++EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) + { + acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); ++ if (N > 1) { ++ acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); ++ } ++ if (N > 2) { ++ acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); ++ } ++ if (N > 3) { ++ acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); ++ } + } + +-template +-EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +-{ +- acc.packet[0] = pmul(accZ.packet[0], pAlpha); +- acc.packet[1] = pmul(accZ.packet[1], pAlpha); +- acc.packet[2] = pmul(accZ.packet[2], pAlpha); +- acc.packet[3] = pmul(accZ.packet[3], pAlpha); +-} +- +-template +-EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) ++template ++EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) + { + acc.packet[0] = pmul(accZ.packet[0], pAlpha); ++ if (N > 1) { ++ acc.packet[1] = pmul(accZ.packet[1], pAlpha); ++ } ++ if (N > 2) { ++ acc.packet[2] = pmul(accZ.packet[2], pAlpha); ++ } ++ if (N > 3) { ++ acc.packet[3] = pmul(accZ.packet[3], pAlpha); ++ } + } + + // Complex version of PacketBlock scaling. + template + EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) + { +- bscalec_common(cReal, aReal, bReal); ++ bscalec_common(cReal, aReal, bReal); + +- bscalec_common(cImag, aImag, bReal); ++ bscalec_common(cImag, aImag, bReal); + +- pger_common(&cReal, bImag, aImag.packet); ++ pger_common(&cReal, bImag, aImag.packet); + +- pger_common(&cImag, bImag, aReal.packet); ++ pger_common(&cImag, bImag, aReal.packet); + } + +-template +-EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) ++template ++EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) + { + acc.packet[0] = pand(acc.packet[0], pMask); +- acc.packet[1] = pand(acc.packet[1], pMask); +- acc.packet[2] = pand(acc.packet[2], pMask); +- acc.packet[3] = pand(acc.packet[3], pMask); ++ if (N > 1) { ++ acc.packet[1] = pand(acc.packet[1], pMask); ++ } ++ if (N > 2) { ++ acc.packet[2] = pand(acc.packet[2], pMask); ++ } ++ if (N > 3) { ++ acc.packet[3] = pand(acc.packet[3], pMask); ++ } + } + +-template +-EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) ++template ++EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) + { +- band(aReal, pMask); +- band(aImag, pMask); ++ band(aReal, pMask); ++ band(aImag, pMask); + +- bscalec(aReal, aImag, bReal, bImag, cReal, cImag); ++ bscalec(aReal, aImag, bReal, bImag, cReal, cImag); + } + + // Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as 
needed. +-template +-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +-{ +- if (StorageOrder == RowMajor) { +- acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); +- acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); +- acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); +- acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); +- } else { +- acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); +- acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); +- acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); +- acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); +- } +-} +- +-// An overload of bload when you have a PacketBLock with 8 vectors. +-template +-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) ++template ++EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) + { + if (StorageOrder == RowMajor) { +- acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); +- acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); +- acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); +- acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); +- acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); +- acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); +- acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); +- acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); ++ acc.packet[0] = res.template loadPacket(row + 0, col); ++ if (N > 1) { ++ acc.packet[1] = res.template loadPacket(row + 1, col); ++ } ++ if (N > 2) { ++ acc.packet[2] = res.template loadPacket(row + 2, col); ++ } ++ if (N > 3) { ++ acc.packet[3] = res.template loadPacket(row + 3, col); ++ } ++ if (Complex) { ++ acc.packet[0+N] = res.template loadPacket(row + 0, col + accCols); ++ if (N > 1) { ++ acc.packet[1+N] = res.template loadPacket(row + 1, col + accCols); ++ } ++ if (N > 2) { ++ acc.packet[2+N] = res.template loadPacket(row + 2, col + accCols); ++ } ++ if (N > 3) { ++ acc.packet[3+N] = res.template loadPacket(row + 3, col + accCols); ++ } ++ } + } else { +- acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); +- acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); +- acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); +- acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); +- acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); +- acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); +- acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); +- acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); ++ acc.packet[0] = res.template loadPacket(row, col + 0); ++ if (N > 1) { ++ acc.packet[1] = res.template loadPacket(row, col + 1); ++ } ++ if (N > 2) { ++ acc.packet[2] = res.template loadPacket(row, col + 2); ++ } ++ if (N > 3) { ++ acc.packet[3] = res.template loadPacket(row, col + 3); ++ } ++ if (Complex) { ++ acc.packet[0+N] = res.template loadPacket(row + accCols, col + 0); ++ if (N > 1) { ++ acc.packet[1+N] = res.template loadPacket(row + accCols, col + 1); ++ } ++ if (N > 2) { ++ acc.packet[2+N] = res.template loadPacket(row + accCols, col + 2); ++ } ++ if (N > 3) { ++ acc.packet[3+N] = res.template loadPacket(row + accCols, col + 3); ++ } ++ } + } + } + 
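A minimal sketch (my illustration, assuming a plain column-major result buffer rather than Eigen's DataMapper) of the addressing the rewritten bload performs: the RowMajor branch reads N consecutive rows of one column, the ColMajor branch reads N consecutive columns of one row, and the Complex flag pulls a second group of packets offset by accCols for the imaginary halves:

#include <vector>

int main()
{
  const int rows = 8, cols = 8, accCols = 4;
  std::vector<float> res(rows * cols, 1.0f);            // column-major storage
  auto at = [&](int r, int c) { return res[c * rows + r]; };

  const int row = 0, col = 4;
  float accRM[4], accCM[4], accIm[4];
  for (int j = 0; j < 4; ++j) {
    accRM[j] = at(row + j, col);            // RowMajor branch: loadPacket(row + j, col)
    accCM[j] = at(row, col + j);            // ColMajor branch: loadPacket(row, col + j)
    accIm[j] = at(row + accCols, col + j);  // Complex: imaginary block accCols rows below
  }
  (void)accRM; (void)accCM; (void)accIm;
  return 0;
}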
+-template +-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +-{ +- acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); +- acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); +-} +- + const static Packet4i mask41 = { -1, 0, 0, 0 }; + const static Packet4i mask42 = { -1, -1, 0, 0 }; + const static Packet4i mask43 = { -1, -1, -1, 0 }; +@@ -1275,22 +1293,44 @@ EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) + } + } + +-template +-EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) ++template ++EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) + { +- band(accZ, pMask); ++ band(accZ, pMask); + +- bscale(acc, accZ, pAlpha); ++ bscale(acc, accZ, pAlpha); + } + +-template +-EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) ++template EIGEN_ALWAYS_INLINE void ++pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a, ++ Packet& a0, Packet& a1, Packet& a2, Packet& a3) ++{ ++ a0 = pset1(a[0]); ++ if (N > 1) { ++ a1 = pset1(a[1]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a1); ++ } ++ if (N > 2) { ++ a2 = pset1(a[2]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a2); ++ } ++ if (N > 3) { ++ a3 = pset1(a[3]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a3); ++ } ++} ++ ++template<> ++EIGEN_ALWAYS_INLINE void pbroadcastN_old(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) + { +- pbroadcast4(a, a0, a1, a2, a3); ++ pbroadcast4(a, a0, a1, a2, a3); + } + + template<> +-EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) ++EIGEN_ALWAYS_INLINE void pbroadcastN_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) + { + a1 = pload(a); + a3 = pload(a + 2); +@@ -1300,89 +1340,96 @@ EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0 + a3 = vec_splat(a3, 1); + } + +-// PEEL loop factor. 
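Stepping back to the bmask/band helpers above: their job is to keep only the first remaining_rows lanes of a partial accumulator before the final alpha scaling, so a short row block never writes lanes it does not own. A scalar sketch of the idea (illustration only, names are mine):

#include <cstdio>

int main()
{
  const int lanes = 4, remaining_rows = 3;
  const float alpha = 0.5f;
  float acc[lanes]    = {1.f, 2.f, 3.f, 4.f};
  float result[lanes] = {10.f, 10.f, 10.f, 10.f};

  for (int i = 0; i < lanes; ++i) {
    float masked = (i < remaining_rows) ? acc[i] : 0.f;  // band(accZ, pMask)
    result[i] += alpha * masked;                         // bscale(acc, accZ, pAlpha)
  }
  std::printf("%g %g %g %g\n", result[0], result[1], result[2], result[3]);  // 10.5 11 11.5 10
  return 0;
}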
+-#define PEEL 7 +- +-template +-EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( +- const Scalar* &lhs_ptr, +- const Scalar* &rhs_ptr, +- PacketBlock &accZero, +- Index remaining_rows, +- Index remaining_cols) ++template EIGEN_ALWAYS_INLINE void ++pbroadcastN(const __UNPACK_TYPE__(Packet) *a, ++ Packet& a0, Packet& a1, Packet& a2, Packet& a3) + { +- Packet rhsV[1]; +- rhsV[0] = pset1(rhs_ptr[0]); +- pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); +- lhs_ptr += remaining_rows; +- rhs_ptr += remaining_cols; ++ a0 = pset1(a[0]); ++ if (N > 1) { ++ a1 = pset1(a[1]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a1); ++ } ++ if (N > 2) { ++ a2 = pset1(a[2]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a2); ++ } ++ if (N > 3) { ++ a3 = pset1(a[3]); ++ } else { ++ EIGEN_UNUSED_VARIABLE(a3); ++ } + } + +-template +-EIGEN_STRONG_INLINE void gemm_extra_col( +- const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, +- Index depth, +- Index strideA, +- Index offsetA, +- Index row, +- Index col, +- Index remaining_rows, +- Index remaining_cols, +- const Packet& pAlpha) ++template<> EIGEN_ALWAYS_INLINE void ++pbroadcastN(const float *a, ++ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) + { +- const Scalar* rhs_ptr = rhs_base; +- const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; +- PacketBlock accZero; ++ a3 = pload(a); ++ a0 = vec_splat(a3, 0); ++ a1 = vec_splat(a3, 1); ++ a2 = vec_splat(a3, 2); ++ a3 = vec_splat(a3, 3); ++} + +- bsetzero(accZero); ++// PEEL loop factor. ++#define PEEL 7 ++#define PEEL_ROW 7 + +- Index remaining_depth = (depth & -accRows); +- Index k = 0; +- for(; k + PEEL <= remaining_depth; k+= PEEL) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr); +- EIGEN_POWER_PREFETCH(lhs_ptr); +- for (int l = 0; l < PEEL; l++) { +- MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); +- } +- } +- for(; k < remaining_depth; k++) +- { +- MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); ++#define MICRO_UNROLL_PEEL(func) \ ++ func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) ++ ++#define MICRO_ZERO_PEEL(peel) \ ++ if ((PEEL_ROW > peel) && (peel != 0)) { \ ++ bsetzero(accZero##peel); \ ++ } else { \ ++ EIGEN_UNUSED_VARIABLE(accZero##peel); \ + } +- for(; k < depth; k++) +- { +- Packet rhsV[1]; +- rhsV[0] = pset1(rhs_ptr[0]); +- pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); +- lhs_ptr += remaining_rows; +- rhs_ptr += remaining_cols; ++ ++#define MICRO_ZERO_PEEL_ROW \ ++ MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL); ++ ++#define MICRO_WORK_PEEL(peel) \ ++ if (PEEL_ROW > peel) { \ ++ pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ ++ pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \ ++ } else { \ ++ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +- accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); +- for(Index i = 0; i < remaining_rows; i++) { +- res(row + i, col) += accZero.packet[0][i]; ++#define MICRO_WORK_PEEL_ROW \ ++ Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \ ++ MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \ ++ lhs_ptr += (remaining_rows * PEEL_ROW); \ ++ rhs_ptr += (accRows * PEEL_ROW); ++ ++#define MICRO_ADD_PEEL(peel, sum) \ ++ if (PEEL_ROW > peel) { \ ++ for (Index i = 0; i < accRows; i++) { \ ++ accZero##sum.packet[i] += accZero##peel.packet[i]; \ ++ } \ + } +-} + +-template ++#define MICRO_ADD_PEEL_ROW \ ++ MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) 
MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \ ++ MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0) ++ ++template + EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, +- PacketBlock &accZero, +- Index remaining_rows) ++ PacketBlock &accZero) + { + Packet rhsV[4]; +- pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); +- pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); ++ pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); ++ pger(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; + } + +-template +-EIGEN_STRONG_INLINE void gemm_extra_row( ++template ++EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, +@@ -1393,59 +1440,89 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + Index col, + Index rows, + Index cols, +- Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) + { + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; +- PacketBlock accZero, acc; ++ PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc; + +- bsetzero(accZero); ++ bsetzero(accZero0); + +- Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); ++ Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); + Index k = 0; +- for(; k + PEEL <= remaining_depth; k+= PEEL) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr); +- EIGEN_POWER_PREFETCH(lhs_ptr); +- for (int l = 0; l < PEEL; l++) { +- MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); +- } ++ if (remaining_depth >= PEEL_ROW) { ++ MICRO_ZERO_PEEL_ROW ++ do ++ { ++ EIGEN_POWER_PREFETCH(rhs_ptr); ++ EIGEN_POWER_PREFETCH(lhs_ptr); ++ MICRO_WORK_PEEL_ROW ++ } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth); ++ MICRO_ADD_PEEL_ROW + } + for(; k < remaining_depth; k++) + { +- MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); ++ MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero0); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { +- for(Index j = 0; j < 4; j++) { +- acc.packet[j] = res.template loadPacket(row, col + j); +- } +- bscale(acc, accZero, pAlpha, pMask); +- res.template storePacketBlock(row, col, acc); ++ bload(acc, res, row, 0); ++ bscale(acc, accZero0, pAlpha, pMask); ++ res.template storePacketBlock(row, 0, acc); + } else { + for(; k < depth; k++) + { + Packet rhsV[4]; +- pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); +- pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); ++ pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); ++ pger(&accZero0, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; + } + +- for(Index j = 0; j < 4; j++) { +- accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); +- } +- for(Index j = 0; j < 4; j++) { ++ for(Index j = 0; j < accRows; j++) { ++ accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]); + for(Index i = 0; i < remaining_rows; i++) { +- res(row + i, col + j) += accZero.packet[j][i]; ++ res(row + i, j) += accZero0.packet[j][i]; + } + } + } + } + ++template ++EIGEN_ALWAYS_INLINE void gemm_extra_row( ++ const DataMapper& res, ++ const Scalar* lhs_base, ++ const Scalar* rhs_base, ++ Index depth, ++ Index strideA, ++ Index offsetA, ++ Index row, ++ Index col, ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlpha, ++ const Packet& pMask) ++{ ++ switch(remaining_rows) { ++ case 1: ++ 
gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); ++ break; ++ case 2: ++ if (sizeof(Scalar) == sizeof(float)) { ++ gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); ++ } ++ break; ++ default: ++ if (sizeof(Scalar) == sizeof(float)) { ++ gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); ++ } ++ break; ++ } ++} ++ + #define MICRO_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +@@ -1464,34 +1541,24 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + + #define MICRO_WORK_ONE(iter, peel) \ + if (unroll_factor > iter) { \ +- pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ ++ pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + } + + #define MICRO_TYPE_PEEL4(func, func2, peel) \ + if (PEEL > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ +- pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ +- MICRO_UNROLL_WORK(func, func2, peel) \ +- } else { \ +- EIGEN_UNUSED_VARIABLE(rhsV##peel); \ +- } +- +-#define MICRO_TYPE_PEEL1(func, func2, peel) \ +- if (PEEL > peel) { \ +- Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ +- rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ ++ pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + MICRO_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + + #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ +- Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ ++ Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \ + func(func1,func2,0); func(func1,func2,1); \ + func(func1,func2,2); func(func1,func2,3); \ + func(func1,func2,4); func(func1,func2,5); \ +- func(func1,func2,6); func(func1,func2,7); \ +- func(func1,func2,8); func(func1,func2,9); ++ func(func1,func2,6); func(func1,func2,7); + + #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M]; \ +@@ -1505,17 +1572,9 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += accRows; + +-#define MICRO_ONE_PEEL1 \ +- MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ +- rhs_ptr += (remaining_cols * PEEL); +- +-#define MICRO_ONE1 \ +- MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ +- rhs_ptr += remaining_cols; +- + #define MICRO_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- bsetzero(accZero##iter); \ ++ bsetzero(accZero##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##iter); \ + } +@@ -1524,7 +1583,7 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + + #define MICRO_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ ++ lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } +@@ -1540,25 +1599,13 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + + #define MICRO_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ +- acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ +- acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ +- acc.packet[2] = 
res.template loadPacket(row + iter*accCols, col + 2); \ +- acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ +- bscale(acc, accZero##iter, pAlpha); \ +- res.template storePacketBlock(row + iter*accCols, col, acc); \ ++ bload(acc, res, row + iter*accCols, 0); \ ++ bscale(acc, accZero##iter, pAlpha); \ ++ res.template storePacketBlock(row + iter*accCols, 0, acc); \ + } + + #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) + +-#define MICRO_COL_STORE_ONE(iter) \ +- if (unroll_factor > iter) { \ +- acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ +- bscale(acc, accZero##iter, pAlpha); \ +- res.template storePacketBlock(row + iter*accCols, col, acc); \ +- } +- +-#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) +- + template + EIGEN_STRONG_INLINE void gemm_unrolled_iteration( + const DataMapper& res, +@@ -1566,15 +1613,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( + const Scalar* rhs_base, + Index depth, + Index strideA, +- Index offsetA, + Index& row, +- Index col, + const Packet& pAlpha) + { + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; +- PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; +- PacketBlock acc; ++ PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; ++ PacketBlock acc; + + MICRO_SRC_PTR + MICRO_DST_PTR +@@ -1595,101 +1640,100 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( + row += unroll_factor*accCols; + } + +-template +-EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( ++template ++EIGEN_ALWAYS_INLINE void gemm_cols( + const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, ++ const Scalar* blockA, ++ const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, +- Index& row, ++ Index strideB, ++ Index offsetB, + Index col, +- Index remaining_cols, +- const Packet& pAlpha) ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlpha, ++ const Packet& pMask) + { +- const Scalar* rhs_ptr = rhs_base; +- const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; +- PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; +- PacketBlock acc; ++ const DataMapper res3 = res.getSubMapper(0, col); + +- MICRO_SRC_PTR +- MICRO_DST_PTR ++ const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; ++ const Scalar* lhs_base = blockA + accCols*offsetA; ++ Index row = 0; + +- Index k = 0; +- for(; k + PEEL <= depth; k+= PEEL) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr); +- MICRO_PREFETCH +- MICRO_ONE_PEEL1 +- } +- for(; k < depth; k++) +- { +- MICRO_ONE1 +- } +- MICRO_COL_STORE +- +- row += unroll_factor*accCols; +-} +- +-template +-EIGEN_STRONG_INLINE void gemm_unrolled_col( +- const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, +- Index depth, +- Index strideA, +- Index offsetA, +- Index& row, +- Index rows, +- Index col, +- Index remaining_cols, +- const Packet& pAlpha) +-{ + #define MAX_UNROLL 6 + while(row + MAX_UNROLL*accCols <= rows) { +- gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); ++ gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + } + switch( (rows-row)/accCols ) { + #if 
MAX_UNROLL > 7 + case 7: +- gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); ++ gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; + #endif + #if MAX_UNROLL > 6 + case 6: +- gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); ++ gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; + #endif + #if MAX_UNROLL > 5 +- case 5: +- gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); ++ case 5: ++ gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; + #endif + #if MAX_UNROLL > 4 +- case 4: +- gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); ++ case 4: ++ gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; + #endif + #if MAX_UNROLL > 3 +- case 3: +- gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); +- break; ++ case 3: ++ gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_UNROLL > 2 +- case 2: +- gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); +- break; ++ case 2: ++ gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_UNROLL > 1 +- case 1: +- gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); +- break; ++ case 1: ++ gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif +- default: +- break; ++ default: ++ break; + } + #undef MAX_UNROLL ++ ++ if(remaining_rows > 0) ++ { ++ gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); ++ } ++} ++ ++template ++EIGEN_STRONG_INLINE void gemm_extra_cols( ++ const DataMapper& res, ++ const Scalar* blockA, ++ const Scalar* blockB, ++ Index depth, ++ Index strideA, ++ Index offsetA, ++ Index strideB, ++ Index offsetB, ++ Index col, ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlpha, ++ const Packet& pMask) ++{ ++ for (; col < cols; col++) { ++ gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); ++ } + } + + /**************** +@@ -1699,7 +1743,6 @@ template(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- } +- switch( (rows-row)/accCols ) { +-#if MAX_UNROLL > 7 +- case 7: +- gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, 
accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 6 +- case 6: +- gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 5 +- case 5: +- gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 4 +- case 4: +- gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 3 +- case 3: +- gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 2 +- case 2: +- gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +-#if MAX_UNROLL > 1 +- case 1: +- gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; +-#endif +- default: +- break; +- } +-#undef MAX_UNROLL +- +- if(remaining_rows > 0) +- { +- gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); +- } +- } +- +- if(remaining_cols > 0) +- { +- const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; +- const Scalar* lhs_base = blockA; +- +- for(; col < cols; col++) +- { +- Index row = 0; +- +- gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); +- +- if (remaining_rows > 0) +- { +- gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); +- } +- rhs_base++; ++ gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } +- } ++ ++ gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } + + #define accColsC (accCols / 2) +@@ -1791,117 +1765,66 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const + + // PEEL_COMPLEX loop factor. 
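The PEEL_COMPLEX/PEEL_COMPLEX_ROW factors below follow the same peeling scheme as the real kernel: run an unrolled body while a full group of PEEL iterations remains, then finish the tail one step at a time. A compact sketch of that control flow (illustration, not patch code):

#include <cstdio>

int main()
{
  const int PEEL = 3, depth = 10;
  double sum = 0.0;
  int k = 0;
  for (; k + PEEL <= depth; k += PEEL) {   // peeled body: PEEL steps per trip
    sum += k + 0;
    sum += k + 1;
    sum += k + 2;
  }
  for (; k < depth; ++k)                   // remainder loop
    sum += k;
  std::printf("%g\n", sum);                // 45 = 0 + 1 + ... + 9
  return 0;
}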
+ #define PEEL_COMPLEX 3 ++#define PEEL_COMPLEX_ROW 3 + +-template +-EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( +- const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, +- const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, +- PacketBlock &accReal, PacketBlock &accImag, +- Index remaining_rows, +- Index remaining_cols) +-{ +- Packet rhsV[1], rhsVi[1]; +- rhsV[0] = pset1(rhs_ptr_real[0]); +- if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); +- pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); +- lhs_ptr_real += remaining_rows; +- if(!LhsIsReal) lhs_ptr_imag += remaining_rows; +- else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +- rhs_ptr_real += remaining_cols; +- if(!RhsIsReal) rhs_ptr_imag += remaining_cols; +- else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +-} +- +-template +-EIGEN_STRONG_INLINE void gemm_complex_extra_col( +- const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, +- Index depth, +- Index strideA, +- Index offsetA, +- Index strideB, +- Index row, +- Index col, +- Index remaining_rows, +- Index remaining_cols, +- const Packet& pAlphaReal, +- const Packet& pAlphaImag) +-{ +- const Scalar* rhs_ptr_real = rhs_base; +- const Scalar* rhs_ptr_imag; +- if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; +- else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +- const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; +- const Scalar* lhs_ptr_imag; +- if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; +- else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +- PacketBlock accReal, accImag; +- PacketBlock taccReal, taccImag; +- PacketBlock acc0, acc1; +- +- bsetzero(accReal); +- bsetzero(accImag); ++#define MICRO_COMPLEX_UNROLL_PEEL(func) \ ++ func(0) func(1) func(2) func(3) + +- Index remaining_depth = (depth & -accRows); +- Index k = 0; +- for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr_real); +- if(!RhsIsReal) { +- EIGEN_POWER_PREFETCH(rhs_ptr_imag); +- } +- EIGEN_POWER_PREFETCH(lhs_ptr_real); +- if(!LhsIsReal) { +- EIGEN_POWER_PREFETCH(lhs_ptr_imag); +- } +- for (int l = 0; l < PEEL_COMPLEX; l++) { +- MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); +- } +- } +- for(; k < remaining_depth; k++) +- { +- MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); ++#define MICRO_COMPLEX_ZERO_PEEL(peel) \ ++ if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ ++ bsetzero(accReal##peel); \ ++ bsetzero(accImag##peel); \ ++ } else { \ ++ EIGEN_UNUSED_VARIABLE(accReal##peel); \ ++ EIGEN_UNUSED_VARIABLE(accImag##peel); \ + } + +- for(; k < depth; k++) +- { +- Packet rhsV[1], rhsVi[1]; +- rhsV[0] = pset1(rhs_ptr_real[0]); +- if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); +- pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); +- lhs_ptr_real += remaining_rows; +- if(!LhsIsReal) lhs_ptr_imag += remaining_rows; +- rhs_ptr_real += remaining_cols; +- if(!RhsIsReal) rhs_ptr_imag += remaining_cols; ++#define MICRO_COMPLEX_ZERO_PEEL_ROW \ ++ MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL); ++ ++#define MICRO_COMPLEX_WORK_PEEL(peel) \ ++ if (PEEL_COMPLEX_ROW > peel) { \ ++ pbroadcastN_old(rhs_ptr_real + (accRows * peel), 
rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ ++ if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ ++ pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ ++ } else { \ ++ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ ++ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +- bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); +- bcouple_common(taccReal, taccImag, acc0, acc1); ++#define MICRO_COMPLEX_WORK_PEEL_ROW \ ++ Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ ++ Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ ++ MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \ ++ lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \ ++ if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \ ++ else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \ ++ rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \ ++ if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \ ++ else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + +- if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) +- { +- res(row + 0, col + 0) += pfirst(acc0.packet[0]); +- } else { +- acc0.packet[0] += res.template loadPacket(row + 0, col + 0); +- res.template storePacketBlock(row + 0, col + 0, acc0); +- if(remaining_rows > accColsC) { +- res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); +- } ++#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ ++ if (PEEL_COMPLEX_ROW > peel) { \ ++ for (Index i = 0; i < accRows; i++) { \ ++ accReal##sum.packet[i] += accReal##peel.packet[i]; \ ++ accImag##sum.packet[i] += accImag##peel.packet[i]; \ ++ } \ + } +-} + +-template ++#define MICRO_COMPLEX_ADD_PEEL_ROW \ ++ MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ ++ MICRO_COMPLEX_ADD_PEEL(1, 0) ++ ++template + EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( + const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, + const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, +- PacketBlock &accReal, PacketBlock &accImag, +- Index remaining_rows) ++ PacketBlock &accReal, PacketBlock &accImag) + { + Packet rhsV[4], rhsVi[4]; +- pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); +- if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); +- pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); ++ pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); ++ if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); ++ pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +@@ -1910,8 +1833,8 @@ EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + +-template +-EIGEN_STRONG_INLINE void gemm_complex_extra_row( ++template ++EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, +@@ -1923,7 +1846,6 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + Index col, + Index rows, + Index cols, +- Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +@@ -1936,93 +1858,129 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const Scalar* lhs_ptr_imag; + if(!LhsIsReal) 
lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +- PacketBlock accReal, accImag; +- PacketBlock taccReal, taccImag; +- PacketBlock acc0, acc1; +- PacketBlock tRes; ++ PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; ++ PacketBlock taccReal, taccImag; ++ PacketBlock acc0, acc1; ++ PacketBlock tRes; + +- bsetzero(accReal); +- bsetzero(accImag); ++ bsetzero(accReal0); ++ bsetzero(accImag0); + +- Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); ++ Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); + Index k = 0; +- for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr_real); +- if(!RhsIsReal) { +- EIGEN_POWER_PREFETCH(rhs_ptr_imag); +- } +- EIGEN_POWER_PREFETCH(lhs_ptr_real); +- if(!LhsIsReal) { +- EIGEN_POWER_PREFETCH(lhs_ptr_imag); +- } +- for (int l = 0; l < PEEL_COMPLEX; l++) { +- MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); +- } ++ if (remaining_depth >= PEEL_COMPLEX_ROW) { ++ MICRO_COMPLEX_ZERO_PEEL_ROW ++ do ++ { ++ EIGEN_POWER_PREFETCH(rhs_ptr_real); ++ if(!RhsIsReal) { ++ EIGEN_POWER_PREFETCH(rhs_ptr_imag); ++ } ++ EIGEN_POWER_PREFETCH(lhs_ptr_real); ++ if(!LhsIsReal) { ++ EIGEN_POWER_PREFETCH(lhs_ptr_imag); ++ } ++ MICRO_COMPLEX_WORK_PEEL_ROW ++ } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); ++ MICRO_COMPLEX_ADD_PEEL_ROW + } + for(; k < remaining_depth; k++) + { +- MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); ++ MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { +- bload(tRes, res, row, col); +- bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); +- bcouple(taccReal, taccImag, tRes, acc0, acc1); +- res.template storePacketBlock(row + 0, col, acc0); +- res.template storePacketBlock(row + accColsC, col, acc1); ++ bload(tRes, res, row, 0); ++ bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); ++ bcouple(taccReal, taccImag, tRes, acc0, acc1); ++ res.template storePacketBlock(row + 0, 0, acc0); ++ res.template storePacketBlock(row + accColsC, 0, acc1); + } else { + for(; k < depth; k++) + { + Packet rhsV[4], rhsVi[4]; +- pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); +- if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); +- pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); ++ pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); ++ if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); ++ pgerc(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + } + +- bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); +- bcouple_common(taccReal, taccImag, acc0, acc1); ++ bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag); ++ bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { +- for(Index j = 
0; j < 4; j++) { +- res(row + 0, col + j) += pfirst(acc0.packet[j]); ++ for(Index j = 0; j < accRows; j++) { ++ res(row + 0, j) += pfirst(acc0.packet[j]); + } + } else { +- for(Index j = 0; j < 4; j++) { ++ for(Index j = 0; j < accRows; j++) { + PacketBlock acc2; +- acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; +- res.template storePacketBlock(row + 0, col + j, acc2); ++ acc2.packet[0] = res.template loadPacket(row + 0, j) + acc0.packet[j]; ++ res.template storePacketBlock(row + 0, j, acc2); + if(remaining_rows > accColsC) { +- res(row + accColsC, col + j) += pfirst(acc1.packet[j]); ++ res(row + accColsC, j) += pfirst(acc1.packet[j]); + } + } + } + } + } + ++template ++EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( ++ const DataMapper& res, ++ const Scalar* lhs_base, ++ const Scalar* rhs_base, ++ Index depth, ++ Index strideA, ++ Index offsetA, ++ Index strideB, ++ Index row, ++ Index col, ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlphaReal, ++ const Packet& pAlphaImag, ++ const Packet& pMask) ++{ ++ switch(remaining_rows) { ++ case 1: ++ gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); ++ break; ++ case 2: ++ if (sizeof(Scalar) == sizeof(float)) { ++ gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); ++ } ++ break; ++ default: ++ if (sizeof(Scalar) == sizeof(float)) { ++ gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); ++ } ++ break; ++ } ++} ++ + #define MICRO_COMPLEX_UNROLL(func) \ +- func(0) func(1) func(2) func(3) func(4) ++ func(0) func(1) func(2) func(3) + + #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + MICRO_COMPLEX_UNROLL(func2); \ +- func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) ++ func(0,peel) func(1,peel) func(2,peel) func(3,peel) + + #define MICRO_COMPLEX_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ +- lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ +- lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ +- lhs_ptr_imag##iter += accCols; \ ++ lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ ++ lhs_ptr_real##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ +@@ -2030,37 +1988,16 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + + #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ + if (unroll_factor > iter) { \ +- pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ +- } +- +-#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ +- if (unroll_factor > iter) { \ +- pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ ++ pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + + #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ + if (PEEL_COMPLEX > peel) { \ +- Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ +- Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ +- pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ 
++ Packet lhsV0, lhsV1, lhsV2, lhsV3; \ ++ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ ++ pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) { \ +- pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ +- } else { \ +- EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ +- } \ +- MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ +- } else { \ +- EIGEN_UNUSED_VARIABLE(rhsV##peel); \ +- EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ +- } +- +-#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ +- if (PEEL_COMPLEX > peel) { \ +- Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ +- Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ +- rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ +- if(!RhsIsReal) { \ +- rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ ++ pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ +@@ -2071,13 +2008,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + } + + #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ +- Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ +- Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ ++ Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ ++ Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ + func(func1,func2,0); func(func1,func2,1); \ +- func(func1,func2,2); func(func1,func2,3); \ +- func(func1,func2,4); func(func1,func2,5); \ +- func(func1,func2,6); func(func1,func2,7); \ +- func(func1,func2,8); func(func1,func2,9); ++ func(func1,func2,2); func(func1,func2,3); + + #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M], rhsVi0[M];\ +@@ -2093,20 +2027,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + rhs_ptr_real += accRows; \ + if(!RhsIsReal) rhs_ptr_imag += accRows; + +-#define MICRO_COMPLEX_ONE_PEEL1 \ +- MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ +- rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ +- if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); +- +-#define MICRO_COMPLEX_ONE1 \ +- MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ +- rhs_ptr_real += remaining_cols; \ +- if(!RhsIsReal) rhs_ptr_imag += remaining_cols; +- + #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- bsetzero(accReal##iter); \ +- bsetzero(accImag##iter); \ ++ bsetzero(accReal##iter); \ ++ bsetzero(accImag##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##iter); \ + EIGEN_UNUSED_VARIABLE(accImag##iter); \ +@@ -2116,15 +2040,9 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + + #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ +- if(!LhsIsReal) { \ +- lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ +- } else { \ +- EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ +- } \ ++ lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ +- EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + + #define 
MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) +@@ -2132,35 +2050,21 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ +- if(!LhsIsReal) { \ +- EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ +- } \ + } + + #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) + + #define MICRO_COMPLEX_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ +- bload(tRes, res, row + iter*accCols, col); \ +- bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ +- bcouple(taccReal, taccImag, tRes, acc0, acc1); \ +- res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ +- res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ ++ bload(tRes, res, row + iter*accCols, 0); \ ++ bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ ++ bcouple(taccReal, taccImag, tRes, acc0, acc1); \ ++ res.template storePacketBlock(row + iter*accCols + 0, 0, acc0); \ ++ res.template storePacketBlock(row + iter*accCols + accColsC, 0, acc1); \ + } + + #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) + +-#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ +- if (unroll_factor > iter) { \ +- bload(tRes, res, row + iter*accCols, col); \ +- bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ +- bcouple(taccReal, taccImag, tRes, acc0, acc1); \ +- res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ +- res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ +- } +- +-#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) +- + template + EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + const DataMapper& res, +@@ -2168,29 +2072,26 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + const Scalar* rhs_base, + Index depth, + Index strideA, +- Index offsetA, + Index strideB, + Index& row, +- Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) + { + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; ++ const Index imag_delta = accCols*strideA; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } +- const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; +- const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; +- const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; +- PacketBlock accReal0, accImag0, accReal1, accImag1; +- PacketBlock accReal2, accImag2, accReal3, accImag3; +- PacketBlock accReal4, accImag4; +- PacketBlock taccReal, taccImag; +- PacketBlock acc0, acc1; +- PacketBlock tRes; ++ const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; ++ const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; ++ PacketBlock accReal0, accImag0, accReal1, accImag1; ++ PacketBlock accReal2, accImag2, accReal3, accImag3; ++ PacketBlock taccReal, taccImag; ++ PacketBlock acc0, acc1; ++ PacketBlock tRes; + + MICRO_COMPLEX_SRC_PTR + MICRO_COMPLEX_DST_PTR +@@ -2214,112 +2115,93 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + row += unroll_factor*accCols; + } + +-template +-EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( ++template ++EIGEN_ALWAYS_INLINE void gemm_complex_cols( + const DataMapper& res, +- const Scalar* lhs_base, +- 
const Scalar* rhs_base, ++ const Scalar* blockA, ++ const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, +- Index& row, ++ Index offsetB, + Index col, +- Index remaining_cols, ++ Index rows, ++ Index cols, ++ Index remaining_rows, + const Packet& pAlphaReal, +- const Packet& pAlphaImag) ++ const Packet& pAlphaImag, ++ const Packet& pMask) + { +- const Scalar* rhs_ptr_real = rhs_base; +- const Scalar* rhs_ptr_imag; +- if(!RhsIsReal) { +- rhs_ptr_imag = rhs_base + remaining_cols*strideB; +- } else { +- EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +- } +- const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; +- const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; +- const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; +- PacketBlock accReal0, accImag0, accReal1, accImag1; +- PacketBlock accReal2, accImag2, accReal3, accImag3; +- PacketBlock accReal4, accImag4; +- PacketBlock taccReal, taccImag; +- PacketBlock acc0, acc1; +- PacketBlock tRes; ++ const DataMapper res3 = res.getSubMapper(0, col); + +- MICRO_COMPLEX_SRC_PTR +- MICRO_COMPLEX_DST_PTR ++ const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; ++ const Scalar* lhs_base = blockA + accCols*offsetA; ++ Index row = 0; + +- Index k = 0; +- for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) +- { +- EIGEN_POWER_PREFETCH(rhs_ptr_real); +- if(!RhsIsReal) { +- EIGEN_POWER_PREFETCH(rhs_ptr_imag); +- } +- MICRO_COMPLEX_PREFETCH +- MICRO_COMPLEX_ONE_PEEL1 ++#define MAX_COMPLEX_UNROLL 3 ++ while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { ++ gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + } +- for(; k < depth; k++) +- { +- MICRO_COMPLEX_ONE1 ++ switch( (rows-row)/accCols ) { ++#if MAX_COMPLEX_UNROLL > 4 ++ case 4: ++ gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_UNROLL > 3 ++ case 3: ++ gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_UNROLL > 2 ++ case 2: ++ gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_UNROLL > 1 ++ case 1: ++ gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++ default: ++ break; + } +- MICRO_COMPLEX_COL_STORE ++#undef MAX_COMPLEX_UNROLL + +- row += unroll_factor*accCols; ++ if(remaining_rows > 0) ++ { ++ gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); ++ } + } + + template +-EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( ++EIGEN_STRONG_INLINE void gemm_complex_extra_cols( + const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, ++ const 
Scalar* blockA, ++ const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, +- Index& row, +- Index rows, ++ Index offsetB, + Index col, +- Index remaining_cols, ++ Index rows, ++ Index cols, ++ Index remaining_rows, + const Packet& pAlphaReal, +- const Packet& pAlphaImag) ++ const Packet& pAlphaImag, ++ const Packet& pMask) + { +-#define MAX_COMPLEX_UNROLL 3 +- while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { +- gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); ++ for (; col < cols; col++) { ++ gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } +- switch( (rows-row)/accCols ) { +-#if MAX_COMPLEX_UNROLL > 4 +- case 4: +- gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 3 +- case 3: +- gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 2 +- case 2: +- gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 1 +- case 1: +- gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); +- break; +-#endif +- default: +- break; +- } +-#undef MAX_COMPLEX_UNROLL + } + + template + EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index remaining_rows = rows % accCols; +- const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; +@@ -2334,64 +2216,10 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { +- const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; +- const Scalar* lhs_base = blockA; +- Index row = 0; +- +-#define MAX_COMPLEX_UNROLL 3 +- while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { +- gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- } +- switch( (rows-row)/accCols ) { +-#if MAX_COMPLEX_UNROLL > 4 +- case 4: +- gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 3 +- case 3: +- gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, 
ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 2 +- case 2: +- gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_UNROLL > 1 +- case 1: +- gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +- default: +- break; +- } +-#undef MAX_COMPLEX_UNROLL +- +- if(remaining_rows > 0) +- { +- gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); +- } ++ gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + +- if(remaining_cols > 0) +- { +- const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; +- const Scalar* lhs_base = blockA; +- +- for(; col < cols; col++) +- { +- Index row = 0; +- +- gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); +- +- if (remaining_rows > 0) +- { +- gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); +- } +- rhs_base++; +- } +- } ++ gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + + #undef accColsC +diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +index d4287cc6f..768d9c7c4 100644 +--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h ++++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +@@ -11,22 +11,8 @@ namespace Eigen { + + namespace internal { + +-template +-EIGEN_STRONG_INLINE void gemm_extra_col( +- const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, +- Index depth, +- Index strideA, +- Index offsetA, +- Index row, +- Index col, +- Index remaining_rows, +- Index remaining_cols, +- const Packet& pAlpha); +- + template +-EIGEN_STRONG_INLINE void gemm_extra_row( ++EIGEN_ALWAYS_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, +@@ -41,41 +27,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row( + const Packet& pAlpha, + const Packet& pMask); + +-template +-EIGEN_STRONG_INLINE void gemm_unrolled_col( ++template ++EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, ++ const Scalar* blockA, ++ const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, +- Index& row, +- Index rows, ++ Index strideB, ++ Index offsetB, + Index col, +- Index remaining_cols, +- const Packet& pAlpha); ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlpha, ++ const Packet& pMask); + + template + EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); + + template +-EIGEN_STRONG_INLINE void gemm_complex_extra_col( +- const DataMapper& res, +- const Scalar* lhs_base, 
+- const Scalar* rhs_base, +- Index depth, +- Index strideA, +- Index offsetA, +- Index strideB, +- Index row, +- Index col, +- Index remaining_rows, +- Index remaining_cols, +- const Packet& pAlphaReal, +- const Packet& pAlphaImag); +- +-template +-EIGEN_STRONG_INLINE void gemm_complex_extra_row( ++EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, +@@ -93,123 +66,88 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const Packet& pMask); + + template +-EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( ++EIGEN_STRONG_INLINE void gemm_complex_extra_cols( + const DataMapper& res, +- const Scalar* lhs_base, +- const Scalar* rhs_base, ++ const Scalar* blockA, ++ const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, +- Index& row, +- Index rows, ++ Index offsetB, + Index col, +- Index remaining_cols, ++ Index rows, ++ Index cols, ++ Index remaining_rows, + const Packet& pAlphaReal, +- const Packet& pAlphaImag); ++ const Packet& pAlphaImag, ++ const Packet& pMask); + + template + EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); + +-template +-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); ++template ++EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +-template +-EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); +- +-template +-EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); ++template ++EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); + + template + EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); + +-const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, +- 16, 17, 18, 19, +- 4, 5, 6, 7, +- 20, 21, 22, 23}; +- +-const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, +- 24, 25, 26, 27, +- 12, 13, 14, 15, +- 28, 29, 30, 31}; +-//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 +-const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, +- 16, 17, 18, 19, 20, 21, 22, 23}; +- +-//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 +-const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, +- 24, 25, 26, 27, 28, 29, 30, 31}; +- +- + // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. 
+-template +-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +-{ +- acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); +- acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); +- acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); +- acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); +- +- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); +- acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); +- acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); +- acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); +-} +- +-template +-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +-{ +- bcouple_common(taccReal, taccImag, acc1, acc2); +- +- acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); +- acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); +- acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); +- acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); +- +- acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); +- acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); +- acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); +- acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); +-} +- +-template +-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) ++template ++EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) + { +- acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); +- +- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); ++ acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]); ++ if (N > 1) { ++ acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]); ++ } ++ if (N > 2) { ++ acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]); ++ } ++ if (N > 3) { ++ acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]); ++ } ++ ++ acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]); ++ if (N > 1) { ++ acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]); ++ } ++ if (N > 2) { ++ acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]); ++ } ++ if (N > 3) { ++ acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]); ++ } + } + +-template +-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) ++template ++EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) + { +- bcouple_common(taccReal, taccImag, acc1, acc2); ++ bcouple_common(taccReal, taccImag, acc1, acc2); + + acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); +- +- acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); +-} +- +-template<> +-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +-{ +- acc1.packet[0].v = vec_perm(taccReal.packet[0], 
taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); +- acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); +- acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); +- acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); +- +- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); +- acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); +- acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); +- acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); +-} +- +-template<> +-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +-{ +- acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); +- +- acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); ++ if (N > 1) { ++ acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); ++ } ++ if (N > 2) { ++ acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); ++ } ++ if (N > 3) { ++ acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); ++ } ++ ++ acc2.packet[0] = padd(tRes.packet[0+N], acc2.packet[0]); ++ if (N > 1) { ++ acc2.packet[1] = padd(tRes.packet[1+N], acc2.packet[1]); ++ } ++ if (N > 2) { ++ acc2.packet[2] = padd(tRes.packet[2+N], acc2.packet[2]); ++ } ++ if (N > 3) { ++ acc2.packet[3] = padd(tRes.packet[3+N], acc2.packet[3]); ++ } + } + + // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. +diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +index f1f8352c9..e18b7f267 100644 +--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h ++++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +@@ -11,7 +11,7 @@ + #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + +-#pragma GCC target("cpu=power10") ++#pragma GCC target("cpu=power10,htm") + + #ifdef __has_builtin + #if !__has_builtin(__builtin_vsx_assemble_pair) +@@ -32,37 +32,37 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) + } + + template +-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) ++EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc) + { + PacketBlock result; + __builtin_mma_disassemble_acc(&result.packet, acc); + + PacketBlock tRes; +- bload(tRes, data, i, j); ++ bload(tRes, data, i, 0); + +- bscale(tRes, result, alpha); ++ bscale(tRes, result, alpha); + +- data.template storePacketBlock(i, j, tRes); ++ data.template storePacketBlock(i, 0, tRes); + } + +-template +-EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) ++template ++EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) + { + PacketBlock resultReal, resultImag; + __builtin_mma_disassemble_acc(&resultReal.packet, accReal); + __builtin_mma_disassemble_acc(&resultImag.packet, accImag); + + PacketBlock tRes; +- bload(tRes, data, i, j); ++ bload(tRes, data, i, 
0); + + PacketBlock taccReal, taccImag; + bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); + + PacketBlock acc1, acc2; +- bcouple(taccReal, taccImag, tRes, acc1, acc2); ++ bcouple(taccReal, taccImag, tRes, acc1, acc2); + +- data.template storePacketBlock(i + N*accColsC, j, acc1); +- data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); ++ data.template storePacketBlock(i, 0, acc1); ++ data.template storePacketBlock(i + accColsC, 0, acc2); + } + + // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments +@@ -127,7 +127,7 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag + template + EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) + { +- rhsV = ploadRhs((const Scalar*)(rhs)); ++ rhsV = ploadRhs(rhs); + } + + template<> +@@ -186,12 +186,11 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) + } + + #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ +- type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ ++ type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ + MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ +- MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ +- MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); ++ MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); + + #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0; \ +@@ -224,7 +223,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) + + #define MICRO_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ ++ lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } +@@ -240,21 +239,19 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) + + #define MICRO_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ +- storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ ++ storeAccumulator(row + iter*accCols, res, pAlpha, &accZero##iter); \ + } + + #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) + + template +-EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( ++EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, +- Index offsetA, + Index& row, +- Index col, + const Packet& pAlpha) + { + const Scalar* rhs_ptr = rhs_base; +@@ -280,94 +277,98 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( + row += unroll_factor*accCols; + } + +-template +-void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) ++template ++EIGEN_ALWAYS_INLINE void gemmMMA_cols( ++ const DataMapper& res, ++ const Scalar* blockA, ++ const Scalar* blockB, ++ Index depth, ++ Index strideA, ++ Index offsetA, ++ Index strideB, ++ Index offsetB, ++ Index col, ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlpha, ++ const Packet& pMask) + { +- const Index remaining_rows = rows % accCols; +- 
const Index remaining_cols = cols % accRows; +- +- if( strideA == -1 ) strideA = depth; +- if( strideB == -1 ) strideB = depth; +- +- const Packet pAlpha = pset1(alpha); +- const Packet pMask = bmask((const int)(remaining_rows)); ++ const DataMapper res3 = res.getSubMapper(0, col); + +- Index col = 0; +- for(; col + accRows <= cols; col += accRows) +- { +- const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; +- const Scalar* lhs_base = blockA; ++ const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; ++ const Scalar* lhs_base = blockA + accCols*offsetA; ++ Index row = 0; + +- Index row = 0; + #define MAX_MMA_UNROLL 7 +- while(row + MAX_MMA_UNROLL*accCols <= rows) { +- gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- } +- switch( (rows-row)/accCols ) { ++ while(row + MAX_MMA_UNROLL*accCols <= rows) { ++ gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ } ++ switch( (rows-row)/accCols ) { + #if MAX_MMA_UNROLL > 7 +- case 7: +- gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 7: ++ gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 6 +- case 6: +- gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 6: ++ gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 5 +- case 5: +- gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 5: ++ gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 4 +- case 4: +- gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 4: ++ gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 3 +- case 3: +- gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 3: ++ gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 2 +- case 2: +- gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 2: ++ gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif + #if MAX_MMA_UNROLL > 1 +- case 1: +- gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, 
depth, strideA, offsetA, row, col, pAlpha); +- break; ++ case 1: ++ gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); ++ break; + #endif +- default: +- break; +- } ++ default: ++ break; ++ } + #undef MAX_MMA_UNROLL + +- if(remaining_rows > 0) +- { +- gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); +- } +- } ++ if(remaining_rows > 0) ++ { ++ gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); ++ } ++} + +- if(remaining_cols > 0) +- { +- const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; +- const Scalar* lhs_base = blockA; ++template ++void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) ++{ ++ const Index remaining_rows = rows % accCols; + +- for(; col < cols; col++) +- { +- Index row = 0; ++ if( strideA == -1 ) strideA = depth; ++ if( strideB == -1 ) strideB = depth; + +- gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); ++ const Packet pAlpha = pset1(alpha); ++ const Packet pMask = bmask((const int)(remaining_rows)); + +- if (remaining_rows > 0) +- { +- gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); +- } +- rhs_base++; +- } ++ Index col = 0; ++ for(; col + accRows <= cols; col += accRows) ++ { ++ gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } ++ ++ gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } + + #define accColsC (accCols / 2) +@@ -375,21 +376,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, + #define advanceCols ((RhsIsReal) ? 1 : 2) + + // PEEL_COMPLEX_MMA loop factor. 
+-#define PEEL_COMPLEX_MMA 7 ++#define PEEL_COMPLEX_MMA 3 + + #define MICRO_COMPLEX_MMA_UNROLL(func) \ +- func(0) func(1) func(2) func(3) func(4) ++ func(0) func(1) func(2) func(3) + + #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ +- lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ +- lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ +- lhs_ptr_imag##iter += accCols; \ ++ lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ ++ lhs_ptr_real##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ +@@ -402,8 +402,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, + + #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ + if (PEEL_COMPLEX_MMA > peel) { \ +- Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ +- Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ ++ Packet lhsV0, lhsV1, lhsV2, lhsV3; \ ++ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ +@@ -411,20 +411,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_MMA_UNROLL(func2); \ +- func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ ++ func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + + #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ +- type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ +- type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ ++ type rhsV0, rhsV1, rhsV2, rhsV3; \ ++ type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ +- MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ +- MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ +- MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ +- MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); ++ MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); + + #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0, rhsVi0; \ +@@ -461,15 +458,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, + + #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ +- lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ +- if(!LhsIsReal) { \ +- lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ +- } else { \ +- EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ +- } \ ++ lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ +- EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + + #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) +@@ -477,45 +468,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const 
Scalar* blockB, + #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ +- if(!LhsIsReal) { \ +- EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ +- } \ + } + + #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) + + #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ +- storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ ++ storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + } + + #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) + + template +-EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( ++EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, +- Index offsetA, + Index strideB, + Index& row, +- Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) + { + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; ++ const Index imag_delta = accCols*strideA; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } +- const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; +- const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; +- const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; +- __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; ++ const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; ++ const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; ++ __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + + MICRO_COMPLEX_MMA_SRC_PTR + MICRO_COMPLEX_MMA_DST_PTR +@@ -539,11 +525,70 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( + row += unroll_factor*accCols; + } + ++template ++EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( ++ const DataMapper& res, ++ const Scalar* blockA, ++ const Scalar* blockB, ++ Index depth, ++ Index strideA, ++ Index offsetA, ++ Index strideB, ++ Index offsetB, ++ Index col, ++ Index rows, ++ Index cols, ++ Index remaining_rows, ++ const Packet& pAlphaReal, ++ const Packet& pAlphaImag, ++ const Packet& pMask) ++{ ++ const DataMapper res3 = res.getSubMapper(0, col); ++ ++ const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; ++ const Scalar* lhs_base = blockA + accCols*offsetA; ++ Index row = 0; ++ ++#define MAX_COMPLEX_MMA_UNROLL 4 ++ while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { ++ gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ } ++ switch( (rows-row)/accCols ) { ++#if MAX_COMPLEX_MMA_UNROLL > 4 ++ case 4: ++ gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_MMA_UNROLL > 3 ++ case 3: ++ gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, 
pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_MMA_UNROLL > 2 ++ case 2: ++ gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++#if MAX_COMPLEX_MMA_UNROLL > 1 ++ case 1: ++ gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); ++ break; ++#endif ++ default: ++ break; ++ } ++#undef MAX_COMPLEX_MMA_UNROLL ++ ++ if(remaining_rows > 0) ++ { ++ gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); ++ } ++} ++ + template + void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index remaining_rows = rows % accCols; +- const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; +@@ -558,64 +603,10 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { +- const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; +- const Scalar* lhs_base = blockA; +- Index row = 0; +- +-#define MAX_COMPLEX_MMA_UNROLL 4 +- while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { +- gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- } +- switch( (rows-row)/accCols ) { +-#if MAX_COMPLEX_MMA_UNROLL > 4 +- case 4: +- gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_MMA_UNROLL > 3 +- case 3: +- gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_MMA_UNROLL > 2 +- case 2: +- gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +-#if MAX_COMPLEX_MMA_UNROLL > 1 +- case 1: +- gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); +- break; +-#endif +- default: +- break; +- } +-#undef MAX_COMPLEX_MMA_UNROLL +- +- if(remaining_rows > 0) +- { +- gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); +- } ++ gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, 
remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + +- if(remaining_cols > 0) +- { +- const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; +- const Scalar* lhs_base = blockA; +- +- for(; col < cols; col++) +- { +- Index row = 0; +- +- gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); +- +- if (remaining_rows > 0) +- { +- gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); +- } +- rhs_base++; +- } +- } ++ gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + + #undef accColsC +-- +GitLab + + +From efdb8ac4662beb0c171adec9d36bbb8a6269488b Mon Sep 17 00:00:00 2001 +From: Chip-Kerchner +Date: Tue, 26 Oct 2021 16:42:23 -0500 +Subject: [PATCH 2/2] Fix used uninitialized warnings. + +--- + Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 6 +++--- + Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +index bd5da3623..3745a87cb 100644 +--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h ++++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +@@ -1851,11 +1851,11 @@ EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( + const Packet& pMask) + { + const Scalar* rhs_ptr_real = rhs_base; +- const Scalar* rhs_ptr_imag; ++ const Scalar* rhs_ptr_imag = NULL; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; +- const Scalar* lhs_ptr_imag; ++ const Scalar* lhs_ptr_imag = NULL; + if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; +@@ -2078,7 +2078,7 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + const Packet& pAlphaImag) + { + const Scalar* rhs_ptr_real = rhs_base; +- const Scalar* rhs_ptr_imag; ++ const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; +diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +index e18b7f267..9a3132276 100644 +--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h ++++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +@@ -492,7 +492,7 @@ EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( + const Packet& pAlphaImag) + { + const Scalar* rhs_ptr_real = rhs_base; +- const Scalar* rhs_ptr_imag; ++ const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; +-- +GitLab +