2021-02-04 14:40:56 +00:00
|
|
|
From: Michel Normand <normand@linux.vnet.ibm.com>
|
|
|
|
Subject: openblas ppc64be up2 p8
|
|
|
|
Date: Wed, 03 Feb 2021 15:39:25 +0100
|
|
|
|
|
|
|
|
openblas: limit ppc64 big-endian (ppc64be) builds to kernels up to POWER8
|
|
|
|
|
|
|
|
because:
|
|
|
|
* the openblas build has been failing for ppc64 (BE) in openSUSE
|
|
|
|
since version 0.3.12
|
|
|
|
* ppc64 (BE) is not supported by IBM after POWER8.
|
|
|
|
|
|
|
|
Signed-off-by: Michel Normand <normand@linux.vnet.ibm.com>
|
|
|
|
---
|
2021-03-18 14:43:03 +00:00
|
|
|
Makefile.system | 10 +++++++---
|
2021-02-04 14:40:56 +00:00
|
|
|
driver/others/dynamic_power.c | 11 +++++++++++
|
2021-03-18 14:43:03 +00:00
|
|
|
2 files changed, 18 insertions(+), 3 deletions(-)
|
2021-02-04 14:40:56 +00:00
|
|
|
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
Index: OpenBLAS-0.3.14/driver/others/dynamic_power.c
|
2021-02-04 14:40:56 +00:00
|
|
|
===================================================================
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
--- OpenBLAS-0.3.14.orig/driver/others/dynamic_power.c
|
|
|
|
+++ OpenBLAS-0.3.14/driver/others/dynamic_power.c
|
2021-02-04 14:40:56 +00:00
|
|
|
@@ -3,6 +3,7 @@
|
|
|
|
|
|
|
|
extern gotoblas_t gotoblas_POWER6;
|
|
|
|
extern gotoblas_t gotoblas_POWER8;
|
|
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
|
|
|
extern gotoblas_t gotoblas_POWER9;
|
|
|
|
#endif
|
|
|
|
@@ -13,6 +14,7 @@ extern gotoblas_t gotoblas_POWER9;
|
|
|
|
#ifdef HAVE_P10_SUPPORT
|
|
|
|
extern gotoblas_t gotoblas_POWER10;
|
|
|
|
#endif
|
|
|
|
+#endif
|
|
|
|
|
|
|
|
extern void openblas_warning(int verbose, const char *msg);
|
|
|
|
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
@@ -31,12 +33,14 @@ char *gotoblas_corename(void) {
|
2021-02-04 14:40:56 +00:00
|
|
|
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
#endif
|
2021-02-04 14:40:56 +00:00
|
|
|
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
|
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
|
|
|
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_P10_SUPPORT
|
|
|
|
if (gotoblas == &gotoblas_POWER10) return corename[4];
|
|
|
|
#endif
|
|
|
|
+#endif
|
|
|
|
return corename[0];
|
|
|
|
}
|
|
|
|
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
@@ -200,6 +204,10 @@ static gotoblas_t *get_coretype(void) {
|
|
|
|
#endif
|
2021-02-04 14:40:56 +00:00
|
|
|
if (__builtin_cpu_is("power8"))
|
|
|
|
return &gotoblas_POWER8;
|
|
|
|
+ /* Fall back to the POWER8 implementation for big endian */
|
|
|
|
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
|
|
+ return &gotoblas_POWER8;
|
|
|
|
+#else
|
|
|
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
|
|
|
if (__builtin_cpu_is("power9"))
|
|
|
|
return &gotoblas_POWER9;
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
@@ -213,6 +221,7 @@ static gotoblas_t *get_coretype(void) {
|
2021-02-04 14:40:56 +00:00
|
|
|
if (__builtin_cpu_is("power10"))
|
|
|
|
return &gotoblas_POWER9;
|
|
|
|
#endif
|
|
|
|
+#endif
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
@@ -237,12 +246,14 @@ static gotoblas_t *force_coretype(char *
|
2021-02-04 14:40:56 +00:00
|
|
|
case 1: return (&gotoblas_POWER6);
|
- Update to version 0.3.14
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISC V:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with older gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
OBS-URL: https://build.opensuse.org/package/show/science/openblas?expand=0&rev=120
2021-03-18 08:47:05 +00:00
|
|
|
#endif
|
2021-02-04 14:40:56 +00:00
|
|
|
case 2: return (&gotoblas_POWER8);
|
|
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
|
|
|
case 3: return (&gotoblas_POWER9);
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_P10_SUPPORT
|
|
|
|
case 4: return (&gotoblas_POWER10);
|
|
|
|
#endif
|
|
|
|
+#endif
|
|
|
|
default: return NULL;
|
|
|
|
}
|
|
|
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
2021-03-18 14:43:03 +00:00
|
|
|
Index: OpenBLAS-0.3.14/Makefile.system
|
|
|
|
===================================================================
|
|
|
|
--- OpenBLAS-0.3.14.orig/Makefile.system
|
|
|
|
+++ OpenBLAS-0.3.14/Makefile.system
|
|
|
|
@@ -673,6 +673,9 @@ ifeq ($(ARCH), power)
|
|
|
|
ifneq ($(C_COMPILER), PGI)
|
|
|
|
DYNAMIC_CORE = POWER6
|
|
|
|
DYNAMIC_CORE += POWER8
|
|
|
|
+ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
|
|
|
|
+$(info, OpenBLAS: for big endian limit to POWER8 kernels.)
|
|
|
|
+else
|
|
|
|
ifneq ($(C_COMPILER), GCC)
|
|
|
|
DYNAMIC_CORE += POWER9
|
|
|
|
DYNAMIC_CORE += POWER10
|
|
|
|
@@ -697,11 +700,12 @@ else
|
|
|
|
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
|
|
|
endif
|
|
|
|
endif
|
|
|
|
-else
|
|
|
|
+endif # __ORDER_BIG_ENDIAN__
|
|
|
|
+else # C_COMPILER PGI
|
|
|
|
DYNAMIC_CORE = POWER8
|
|
|
|
DYNAMIC_CORE += POWER9
|
|
|
|
-endif
|
|
|
|
-endif
|
|
|
|
+endif # C_COMPILER PGI
|
|
|
|
+endif # ARCH power
|
|
|
|
|
|
|
|
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
|
|
|
ifndef DYNAMIC_CORE
|