From 6232237dba7bdd7e185216f7bb0d733ba4c0486e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 11 Dec 2020 23:41:17 +0100 Subject: [PATCH 1/8] Make fallback from P10 to P9 conditional on suitable compiler --- driver/others/dynamic_power.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d60ae68fc..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -53,8 +53,10 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; +#endif return NULL; } From 77edf82c7faf9af1412b0f0c9de7a7543341b2e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 01:25:20 +0100 Subject: [PATCH 2/8] Update Changelog.txt for 0.3.13 --- Changelog.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index edd3563ec..807c5ff20 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,54 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.13 + 12-Dec-2020 + + common: + * Added a generic bfloat16 SBGEMV kernel + * Fixed a potentially severe memory leak after fork in OpenMP builds + that was introduces in 0.3.12 + * Added detection of the Fujitsu Fortran compiler + * Added detection of the (e)gfortran compiler on OpenBSD + * Added support for overriding the default name of the library independently + from symbol suffixing in the gmake builds (already supported in cmake) + +RISCV: + * Added a RISC V port optimized for C910V + +POWER: + * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N + * Improved DGEMM performance on POWER10 + * Improved STRSM and DTRSM performance on POWER9 and POWER10 + * Fixed segmemtation faults in DYNAMIC_ARCH builds + * Fixed compilation with the PGI compiler + +x86: + * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 + +x86_64: + * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake + * Improved the performance of SASUM and DASUM kernels through parallelization + * Improved the performance of SROT and DROT kernels + * Improved the performance of multithreaded xSYRK + * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran + (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or + wrong results) + * Fixed miscompilations by old gcc 4.6 + * Fixed misdetection of AVX2 capability in some Sandybridge cpus + * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD + +ARM64: + * Fixed segmemtation faults in DYNAMIC_ARCH builds + +MIPS: + * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA + * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV + * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP + * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) + +SPARC: + * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers + ==================================================================== Version 0.3.12 24-Oct-2020 From 3dec81200cdac01651681a3e36f77179a0815eb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 14:27:37 +0100 Subject: [PATCH 3/8] Update Changelog.txt Co-authored-by: h-vetinari --- Changelog.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 807c5ff20..cbc7007ac 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -6,7 +6,7 @@ Version 0.3.13 common: * Added a generic bfloat16 SBGEMV kernel * Fixed a potentially severe memory leak after fork in OpenMP builds - that was introduces in 0.3.12 + that was introduced in 0.3.12 * Added detection of the Fujitsu Fortran compiler * Added detection of the (e)gfortran compiler on OpenBSD * Added support for overriding the default name of the library independently From d3ec787f774bc678ec13f0ed87fe2f3d67af1a11 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:14:49 +0100 Subject: [PATCH 4/8] Update version to 0.3.13 for release --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..e4b82104e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev +VERSION = 0.3.13 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 7bc0e4a2e001117d7e51f0ef8ea1abc4b734d079 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:15:33 +0100 Subject: [PATCH 5/8] Update version to 0.3.13 for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 9031ebd7d50d903ad2372001f4d20908f0c0bf20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 23:28:20 +0100 Subject: [PATCH 6/8] Update version to 0.3.13.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12730e0e3..c5ba3ceed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 13) +set(OpenBLAS_PATCH_VERSION 13.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 87315e8a8d1f27684d886c31742d95d98886aa8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 23:28:49 +0100 Subject: [PATCH 7/8] Update version to 0.3.13.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index e4b82104e..c68c20923 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.13 +VERSION = 0.3.13.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 2fb11f873bfb5d690cbe096d81a837ede4cfa63f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 13 Dec 2020 10:41:45 -0600 Subject: [PATCH 8/8] POWER10: Improve copy performance This patch aligns the stores to 32 byte boundary for scopy and dcopy before entering into vector pair loop. For ccopy, changed the store instructions to stxv to improve performance of unaligned cases. --- kernel/power/ccopy_microk_power10.c | 115 ++++++++++++++++++++++++++++ kernel/power/ccopy_power10.c | 4 +- kernel/power/copy_microk_power10.c | 25 +++--- kernel/power/dcopy_power10.c | 16 ++-- kernel/power/scopy_power10.c | 15 +++- 5 files changed, 152 insertions(+), 23 deletions(-) create mode 100644 kernel/power/ccopy_microk_power10.c diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c new file mode 100644 index 000000000..6c80f9cd4 --- /dev/null +++ b/kernel/power/ccopy_microk_power10.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL 1 + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "addi %3, %3, 256 \n\t" + "addi %2, %2, 256 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index a5877cd12..41c510460 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "copy_microk_power10.c" +#include "ccopy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index c90dc3785..8bca1a1e7 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "one%=: \n\t" "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" "stxvp 38, 96(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" "stxvp 46, 224(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" "stxvp 54, 352(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" "stxvp 62, 480(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %3, %3, 512 \n\t" diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index cd10b7136..6c5eb4d77 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - - BLASLONG n1 = n & -64; - if ( n1 > 0 ) + if ( n >= 64 ) { - copy_kernel(n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 298a8998a..3398ce827 100644 --- a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -128; - if ( n1 > 0 ) + if ( n >= 128 ) { - copy_kernel (n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -128; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n)