From 318f0949c345cbea4743cb3f9af3fd5a13dd402f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Mon, 23 Nov 2015 12:08:56 +0530 Subject: [PATCH] lapack-test fixes in nrm2 kernels for Cortex A57 --- kernel/arm64/KERNEL.CORTEXA57 | 10 +- kernel/arm64/nrm2.S | 225 +++++++++++++++++++++++ kernel/arm64/znrm2.S | 326 ++++++++++++++++++++-------------- 3 files changed, 423 insertions(+), 138 deletions(-) create mode 100644 kernel/arm64/nrm2.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 0c48b9f2a..7c8eeeea7 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -25,15 +25,15 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S -SDOTKERNEL = dot.S +SDOTKERNEL = dot.S DDOTKERNEL = dot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S -#SNRM2KERNEL = snrm2.S -#DNRM2KERNEL = dnrm2.S -#CNRM2KERNEL = znrm2.S -#ZNRM2KERNEL = znrm2.S +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S SROTKERNEL = rot.S DROTKERNEL = rot.S diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S new file mode 100644 index 000000000..5d06c13c0 --- /dev/null +++ b/kernel/arm64/nrm2.S @@ -0,0 +1,225 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 +#define X x1 +#define INC_X x2 + +#define I x3 + +#if !defined(DOUBLE) +#define SSQ s0 +#define SCALE s1 +#define REGZERO s5 +#define REGONE s6 +#else +#define SSQ d0 +#define SCALE d1 +#define REGZERO d5 +#define REGONE d6 +#endif + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ldr s4, [X], #4 + fcmp s4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_F1_SCALE_GE_X_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_X_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X], #8 + fcmp d4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_F1_SCALE_GE_X_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_X_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_F1_NEXT_\@: +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ldr s4, [X] + fcmp s4, REGZERO + beq KERNEL_S1_NEXT + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_S1_SCALE_GE_X + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_X: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X] + fcmp d4, REGZERO + beq KERNEL_S1_NEXT + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_S1_SCALE_GE_X + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_X: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_S1_NEXT: + add X, X, INC_X +.endm + +.macro KERNEL_F8 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 // INC_X * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE +#endif +.endm + +.macro INIT + eor v1.16b, v1.16b, v1.16b // scale=0.0 + fmov SSQ, #1.0 + fmov REGONE, SSQ + fmov REGZERO, SCALE +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + + INIT + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 // I = N / 8 + cmp I, xzr + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + + INIT_S + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + fmul SSQ, SCALE, SSQ + + ret + + EPILOGUE + diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index 0c3d264e4..1360dc993 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -28,201 +28,261 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" -#define N x0 /* vector length */ -#define X x1 /* X vector address */ -#define INC_X x2 /* X stride */ -#define I x5 /* loop variable */ +#define N x0 +#define X x1 +#define INC_X x2 -/******************************************************************************* -* Macro definitions -*******************************************************************************/ +#define I x3 #if !defined(DOUBLE) -#define TMPF s6 -#define SSQ s0 -#define TMPVF {v6.s}[0] -#define SZ 4 +#define SSQ s0 +#define SCALE s1 +#define REGZERO s6 +#define REGONE s7 #else -#define TMPF d6 -#define SSQ d0 -#define TMPVF {v6.d}[0] -#define SZ 8 +#define SSQ d0 +#define SCALE d1 +#define REGZERO d6 +#define REGONE d7 #endif -/******************************************************************************/ +/************************************************************************************** +* Macro definitions +**************************************************************************************/ .macro KERNEL_F1 #if !defined(DOUBLE) - ld1 {v1.2s}, [X], #8 - fmul v1.2s, v1.2s, v1.2s - faddp TMPF, v1.2s - fadd SSQ, SSQ, TMPF + ldr s4, [X], #4 + fcmp s4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_F1_SCALE_GE_XR_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_XR_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +KERNEL_F1_NEXT_\@: + ldr s5, [X], #4 + fcmp s5, REGZERO + beq KERNEL_F1_END_\@ + fabs s5, s5 + fcmp SCALE, s5 + bge KERNEL_F1_SCALE_GE_XI_\@ + fdiv s2, SCALE, s5 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s5 + b KERNEL_F1_END_\@ +KERNEL_F1_SCALE_GE_XI_\@: + fdiv s2, s5, SCALE + fmla SSQ, s2, v2.s[0] #else - ld1 {v1.2d}, [X], #16 - fmul v1.2d, v1.2d, v1.2d - faddp TMPF, v1.2d - fadd SSQ, SSQ, TMPF -#endif -.endm - -.macro KERNEL_F8 -#if !defined(DOUBLE) - ld1 {v1.4s, v2.4s}, [X], #32 - fmla v0.4s, v1.4s, v1.4s - fmla v5.4s, v2.4s, v2.4s - ld1 {v3.4s,v4.4s}, [X], #32 - fmla v0.4s, v3.4s, v3.4s - fmla v5.4s, v4.4s, v4.4s - PRFM PLDL1KEEP, [X, #1024] -#else // DOUBLE - ld1 {v1.2d, v2.2d}, [X], #32 - fmla v0.2d, v1.2d, v1.2d - fmla v5.2d, v2.2d, v2.2d - ld1 {v3.2d, v4.2d}, [X], #32 - fmla v0.2d, v3.2d, v3.2d - fmla v5.2d, v4.2d, v4.2d - - ld1 {v16.2d, v17.2d}, [X], #32 - fmla v0.2d, v16.2d, v16.2d - fmla v5.2d, v17.2d, v17.2d - ld1 {v18.2d, v19.2d}, [X], #32 - fmla v0.2d, v18.2d, v18.2d - fmla v5.2d, v19.2d, v19.2d -#endif -.endm - -.macro nrm2_kernel_F8_FINALIZE -#if !defined(DOUBLE) - fadd v0.4s, v0.4s, v5.4s - ext v1.16b, v0.16b, v0.16b, #8 - fadd v0.2s, v0.2s, v1.2s - faddp SSQ, v0.2s -#else - fadd v0.2d, v0.2d, v5.2d - faddp SSQ, v0.2d -#endif -.endm - -.macro INIT_S -#if !defined(DOUBLE) - lsl INC_X, INC_X, #3 - ld1 {v1.2s}, [X], INC_X - fmul v1.2s, v1.2s, v1.2s - faddp SSQ, v1.2s -#else - lsl INC_X, INC_X, #4 - ld1 {v1.2d}, [X], INC_X - fmul v1.2d, v1.2d, v1.2d - faddp SSQ, v1.2d + ldr d4, [X], #8 + fcmp d4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_F1_SCALE_GE_XR_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_XR_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +KERNEL_F1_NEXT_\@: + ldr d5, [X], #8 + fcmp d5, REGZERO + beq KERNEL_F1_END_\@ + fabs d5, d5 + fcmp SCALE, d5 + bge KERNEL_F1_SCALE_GE_XI_\@ + fdiv d2, SCALE, d5 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d5 + b KERNEL_F1_END_\@ +KERNEL_F1_SCALE_GE_XI_\@: + fdiv d2, d5, SCALE + fmla SSQ, d2, v2.d[0] #endif +KERNEL_F1_END_\@: .endm .macro KERNEL_S1 #if !defined(DOUBLE) - ld1 {v1.2s}, [X], INC_X - fmul v1.2s, v1.2s, v1.2s - faddp TMPF, v1.2s - fadd SSQ, SSQ, TMPF + ldr s4, [X] + fcmp s4, REGZERO + beq KERNEL_S1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_S1_SCALE_GE_XR_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_S1_NEXT_\@ +KERNEL_S1_SCALE_GE_XR_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +KERNEL_S1_NEXT_\@: + ldr s5, [X, #4] + fcmp s5, REGZERO + beq KERNEL_S1_END_\@ + fabs s5, s5 + fcmp SCALE, s5 + bge KERNEL_S1_SCALE_GE_XI_\@ + fdiv s2, SCALE, s5 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s5 + b KERNEL_S1_END_\@ +KERNEL_S1_SCALE_GE_XI_\@: + fdiv s2, s5, SCALE + fmla SSQ, s2, v2.s[0] #else - ld1 {v1.2d}, [X], INC_X - fmul v1.2d, v1.2d, v1.2d - faddp TMPF, v1.2d - fadd SSQ, SSQ, TMPF + ldr d4, [X] + fcmp d4, REGZERO + beq KERNEL_S1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_S1_SCALE_GE_XR_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_S1_NEXT_\@ +KERNEL_S1_SCALE_GE_XR_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +KERNEL_S1_NEXT_\@: + ldr d5, [X, #8] + fcmp d5, REGZERO + beq KERNEL_S1_END_\@ + fabs d5, d5 + fcmp SCALE, d5 + bge KERNEL_S1_SCALE_GE_XI_\@ + fdiv d2, SCALE, d5 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d5 + b KERNEL_S1_END_\@ +KERNEL_S1_SCALE_GE_XI_\@: + fdiv d2, d5, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_S1_END_\@: + add X, X, INC_X +.endm + +.macro KERNEL_F8 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #4 // INC_X * SIZE #endif .endm -/******************************************************************************* +.macro INIT + eor v1.16b, v1.16b, v1.16b // scale=0.0 + fmov SSQ, #1.0 + fmov REGONE, SSQ + fmov REGZERO, SCALE +.endm + +/************************************************************************************** * End of macro definitions -*******************************************************************************/ +**************************************************************************************/ PROLOGUE -#if !defined(DOUBLE) - fmov SSQ, wzr - fmov s5, SSQ -#else - fmov SSQ, xzr - fmov d5, SSQ -#endif + .align 5 + + INIT + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 - cmp N, xzr - ble nrm2_kernel_zero - cmp INC_X, xzr - ble nrm2_kernel_zero cmp INC_X, #1 bne nrm2_kernel_S_BEGIN nrm2_kernel_F_BEGIN: - asr I, N, #3 + asr I, N, #3 // I = N / 8 cmp I, xzr - beq nrm2_kernel_F1_INIT + ble nrm2_kernel_F1 nrm2_kernel_F8: KERNEL_F8 - subs I, I, #1 - bne nrm2_kernel_F8 - - nrm2_kernel_F8_FINALIZE + subs I, I, #1 + bne nrm2_kernel_F8 nrm2_kernel_F1: ands I, N, #7 ble nrm2_kernel_L999 + nrm2_kernel_F10: KERNEL_F1 - subs I, I, #1 - bne nrm2_kernel_F10 + subs I, I, #1 + bne nrm2_kernel_F10 b nrm2_kernel_L999 -nrm2_kernel_F1_INIT: - - b nrm2_kernel_F1 - nrm2_kernel_S_BEGIN: INIT_S - subs N, N, #1 - ble nrm2_kernel_L999 + mov I, N - asr I, N, #2 - cmp I, xzr - ble nrm2_kernel_S1 - -nrm2_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne nrm2_kernel_S4 - -nrm2_kernel_S1: - - ands I, N, #3 - ble nrm2_kernel_L999 + .align 5 nrm2_kernel_S10: KERNEL_S1 - subs I, I, #1 - bne nrm2_kernel_S10 + subs I, I, #1 + bne nrm2_kernel_S10 + nrm2_kernel_L999: fsqrt SSQ, SSQ - ret + fmul SSQ, SCALE, SSQ -nrm2_kernel_zero: ret EPILOGUE +