From 9a38592c79ee4e4b3a38e18092e880e4e92481c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Dec 2020 21:55:08 +0100 Subject: [PATCH 1/5] Add pointers to the netlib documentation and Gilbert Strang's linear algebra primers --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c6322c32..fed3936ee 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,14 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. +For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: +<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. + ## Binary Packages We provide official binary packages for the following platform: From 0a535e58d857cb3b6d2cd73db7b4197c64c82836 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Tue, 29 Dec 2020 12:06:39 +0000 Subject: [PATCH 2/5] getarch.c: define OPENBLAS_SUPPORTED for riscv64 --- getarch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/getarch.c b/getarch.c index 29671736e..f48944f36 100644 --- a/getarch.c +++ b/getarch.c @@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef __riscv #include "cpuid_riscv64.c" +#define OPENBLAS_SUPPORTED #endif #ifdef __arm__ From 1b2508362b9033468eb98ea4146e31ab50d14fa3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 1 Jan 2021 02:09:40 -0800 Subject: [PATCH 3/5] arm64: Fix nrm2 for input vectors with Inf Fix double precision nrm2 kernels returning NaN when the input vectors contain Inf/-Inf. --- kernel/arm64/KERNEL.NEOVERSEN1 | 8 ++++---- kernel/arm64/KERNEL.THUNDERX2T99 | 8 ++++---- kernel/arm64/KERNEL.THUNDERX3T110 | 17 +++++++---------- kernel/arm64/dznrm2_thunderx2t99.c | 28 +++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index 074d72153..ea010db42 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index 8333f60e6..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = nrm2.S -CNRM2KERNEL = nrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 
b/kernel/arm64/KERNEL.THUNDERX3T110 index 4cdd8769f..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -153,16 +153,13 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -#SNRM2KERNEL = scnrm2_thunderx2t99.c -#CNRM2KERNEL = scnrm2_thunderx2t99.c -##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -##ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#DNRM2KERNEL = dznrm2_thunderx2t99.c -#ZNRM2KERNEL = dznrm2_thunderx2t99.c -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b94f0cffc..b021a2832 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" +#define REGINF "d9" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ble 9f //nrm2_kernel_L999 \n" "1: //nrm2_kernel_F_BEGIN: \n" + " mov x6, #0x7FF0000000000000 //+Infinity \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" + " fmov "REGINF", x6 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " 
fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" + " b 11f \n" + "10: \n" + " str "REGINF", [%[SSQ_]] \n" + " str "REGINF", [%[SCALE_]] \n" + "11: \n" : : 
[SSQ_] "r" (ssq), //%0 @@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, [INCX_] "r" (inc_x) //%4 : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" ); @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) cur_ssq = *ptr; cur_scale = *(ptr + 1); + if (cur_ssq == INFINITY) { + ssq = INFINITY; + scale = INFINITY; + break; + } + if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); From 7aa1ff8ff6d3f151292eeb86c629e4077b867ae0 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 1 Jan 2021 21:19:57 +0000 Subject: [PATCH 4/5] Fix build on FreeBSD/powerpc64le --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index ce3a819a8..ca0879fe6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc64le) +override ARCH=power else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) From 601b711c78a4a652820edacc16c6791a7f120c7d Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 8 Jan 2021 08:01:36 -0600 Subject: [PATCH 5/5] Optimize swap function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
--- kernel/power/cswap.c | 4 +- kernel/power/cswap_microk_power10.c | 127 ++++++++++++++++++++++++++++ kernel/power/dswap.c | 22 ++++- kernel/power/sswap.c | 22 ++++- kernel/power/swap_microk_power10.c | 105 +++++++++++++++++++++++ kernel/power/zswap.c | 4 +- 6 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 kernel/power/cswap_microk_power10.c create mode 100644 kernel/power/swap_microk_power10.c diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 5144a2e93..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c new file mode 100644 index 000000000..2a44a9e30 --- /dev/null +++ b/kernel/power/cswap_microk_power10.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(DOUBLE) +#define HAVE_KERNEL_16 1 +static void zswap_kernel_16 (long n, double *x, double *y) +#else +#define HAVE_KERNEL_32 1 +static void cswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 41, 0(%3) \n\t" + "stxv 40, 16(%3) \n\t" + "stxv 43, 32(%3) \n\t" + "stxv 42, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 49, 0(%4) 
\n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + + "stxv 57, 0(%4) \n\t" + "stxv 56, 16(%4) \n\t" + "stxv 59, 32(%4) \n\t" + "stxv 58, 48(%4) \n\t" + "stxv 61, 64(%4) \n\t" + "stxv 60, 80(%4) \n\t" + "stxv 63, 96(%4) \n\t" + "stxv 62, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -16 \n\t" +#else + "addic. %2, %2, -32 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index ff3f95c79..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { dswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 44522f0a0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 > 0 ) + { + sswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c new file mode 100644 index 000000000..f9c1fee52 --- /dev/null +++ b/kernel/power/swap_microk_power10.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define HAVE_KERNEL_32 1 + +#if defined(DOUBLE) +static void dswap_kernel_32 (long n, double *x, double *y) +#else +static void sswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 52, 64(%4) \n\t" + "stxvp 54, 96(%4) \n\t" + + "stxvp 56, 128(%4) \n\t" + "stxvp 58, 160(%4) \n\t" + "stxvp 60, 192(%4) \n\t" + "stxvp 62, 224(%4) \n\t" + + "addi %4, %4, 256 \n\t" + "addi %3, %3, 256 \n\t" + +#if defined(DOUBLE) + 
"addic. %2, %2, -32 \n\t" +#else + "addic. %2, %2, -64 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 3a5a8eb83..6cd3d9664 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif