From 2610752dbbcfddd0834d7ae2b9ec80862546199d Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Tue, 6 Oct 2015 11:41:15 +0530
Subject: [PATCH] Optimized iamax kernels for CORTEXA57

Co-Authored-By: Ralph Campbell
---
 kernel/arm64/KERNEL.CORTEXA57 |   6 +
 kernel/arm64/idamax.S         | 124 ++++++++++++++++++++
 kernel/arm64/isamax.S         | 213 ++++++++++++++++++++++++++++++++++
 kernel/arm64/izamax.S         | 151 ++++++++++++++++++++++++
 4 files changed, 494 insertions(+)
 create mode 100644 kernel/arm64/idamax.S
 create mode 100644 kernel/arm64/isamax.S
 create mode 100644 kernel/arm64/izamax.S

diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57
index c23ce11fe..174e5d2cd 100644
--- a/kernel/arm64/KERNEL.CORTEXA57
+++ b/kernel/arm64/KERNEL.CORTEXA57
@@ -4,3 +4,9 @@ SAMAXKERNEL = amax.S
 DAMAXKERNEL = amax.S
 CAMAXKERNEL = zamax.S
 ZAMAXKERNEL = zamax.S
+
+ISAMAXKERNEL = isamax.S
+IDAMAXKERNEL = idamax.S
+ICAMAXKERNEL = izamax.S
+IZAMAXKERNEL = izamax.S
+
diff --git a/kernel/arm64/idamax.S b/kernel/arm64/idamax.S
new file mode 100644
index 000000000..fd4265899
--- /dev/null
+++ b/kernel/arm64/idamax.S
@@ -0,0 +1,124 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	N	x0	/* vector length; also the return register */
+#define	X	x1	/* X vector address, post-incremented by each load */
+#define	INC_X	x2	/* X stride: elements on entry, bytes after INIT_S */
+#define INDEX	x3	/* 1-based index of the current max/min value */
+#define Z	x4	/* 1-based index of the element last loaded */
+#define I	x5	/* loop counter */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if defined(USE_MIN)
+#define COND	le
+#else
+#define COND	ge
+#endif
+
+#define MAXF	d0
+#define TMPF	d1
+#define TMPVF	{v1.d}[0]
+#define SZ	8
+
+/******************************************************************************/
+
+.macro INIT_S
+	lsl	INC_X, INC_X, #3		/* stride in bytes: 8 per double */
+	ld1	{v0.d}[0], [X], INC_X		/* d0 (MAXF) = first element */
+	mov	Z, #1
+	mov	INDEX, Z			/* first element is the initial candidate */
+	fabs	MAXF, MAXF
+.endm
+
+.macro KERNEL_S1
+	ld1	TMPVF, [X], INC_X
+	add	Z, Z, #1			/* Z = index of the element just loaded */
+	fabs	TMPF, TMPF
+	fcmp	MAXF, TMPF
+	fcsel	MAXF, MAXF, TMPF, COND		/* keep MAXF when COND holds, else take TMPF */
+	csel	INDEX, INDEX, Z, COND		/* INDEX follows the same selection */
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+	PROLOGUE
+
+	cmp	N, xzr
+	ble	iamax_kernel_zero		/* N <= 0: return 0 */
+	cmp	INC_X, xzr
+	ble	iamax_kernel_zero		/* INC_X <= 0: return 0 */
+
+	INIT_S
+
+	subs	N, N, #1			/* N = elements still to compare */
+	ble	iamax_kernel_L999
+
+	asr	I, N, #2			/* I = number of 4-element groups */
+	cmp	I, xzr
+	ble	iamax_kernel_S1
+
+iamax_kernel_S4:
+
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S4
+
+iamax_kernel_S1:
+
+	ands	I, N, #3			/* I = leftover elements (0..3) */
+	ble	iamax_kernel_L999
+
+iamax_kernel_S10:
+
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S10
+
+iamax_kernel_L999:
+
+	mov	x0, INDEX			/* result: 1-based index */
+	ret
+
+iamax_kernel_zero:
+
+	mov	x0, xzr
+	ret
+
+	EPILOGUE
diff --git a/kernel/arm64/isamax.S b/kernel/arm64/isamax.S
new file mode 100644
index 000000000..309b1c1a4
--- /dev/null
+++ b/kernel/arm64/isamax.S
@@ -0,0 +1,213 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	N	x0	/* vector length; also the return register */
+#define	X	x1	/* X vector address, post-incremented by each load */
+#define	INC_X	x2	/* X stride: elements on entry, bytes after INIT_S */
+#define INDEX	x3	/* 1-based index of the current max value */
+#define Z	x4	/* running 1-based element index (see each macro) */
+#define I	x5	/* loop counter */
+#define X_COPY	x6	/* copy of the original X address */
+#define MAXF_Z	x7	/* 1-based index of the first element of the best 4-block */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#define MAXF	s5
+#define TMPF	s6
+#define TMPVF	{v6.s}[0]
+#define SZ	4
+
+/******************************************************************************/
+
+.macro INIT_F1
+	ldr	MAXF, [X], #SZ
+	mov	INDEX, #1			/* first element is the initial candidate */
+	mov	Z, #2				/* Z = index of the next element to load */
+	fabs	MAXF, MAXF
+.endm
+
+.macro KERNEL_F1
+	ldr	TMPF, [X], #SZ
+	fabs	TMPF, TMPF
+	fcmp	TMPF, MAXF
+	fcsel	MAXF, MAXF, TMPF, le		/* keep MAXF unless TMPF is larger */
+	csel	INDEX, INDEX, Z, le		/* Z = index of the element just loaded */
+	add	Z, Z, #1			/* bump after use; add leaves the flags alone */
+.endm
+
+.macro INIT_F4
+	ld1	{v0.4s}, [X], #16
+	fabs	v0.4s, v0.4s
+	fmaxv	MAXF, v0.4s			/* MAXF = largest magnitude of the first block */
+	mov	Z, #5				/* Z = index of the next element to load */
+	mov	MAXF_Z, #1			/* best block so far starts at index 1 */
+.endm
+
+.macro KERNEL_F4
+	ld1	{v0.4s}, [X], #16
+	fabs	v0.4s, v0.4s
+	fmaxv	TMPF, v0.4s			/* TMPF = largest magnitude of this block */
+	PRFM	PLDL1KEEP, [X, #512]
+	fcmp	TMPF, MAXF
+	fcsel	MAXF, MAXF, TMPF, le
+	csel	MAXF_Z, MAXF_Z, Z, le		/* Z = index of this block's first element */
+	add	Z, Z, #4			/* Z = index of the next element to load */
+.endm
+
+
+.macro KERNEL_F4_FINALIZE
+	mov	INDEX, MAXF_Z			/* start at the first element of the best block */
+	sub	MAXF_Z, MAXF_Z, #1
+	lsl	MAXF_Z, MAXF_Z, #2		/* (index - 1) * 4 = byte offset of that block */
+	add	X_COPY, X_COPY, MAXF_Z
+	ldr	TMPF, [X_COPY], #SZ
+	fabs	TMPF, TMPF
+	fcmp	TMPF, MAXF
+	beq	KERNEL_F4_FINALIZE_DONE		/* first element equal to MAXF wins */
+	add	INDEX, INDEX, #1
+	ldr	TMPF, [X_COPY], #SZ
+	fabs	TMPF, TMPF
+	fcmp	TMPF, MAXF
+	beq	KERNEL_F4_FINALIZE_DONE
+	add	INDEX, INDEX, #1
+	ldr	TMPF, [X_COPY], #SZ
+	fabs	TMPF, TMPF
+	fcmp	TMPF, MAXF
+	beq	KERNEL_F4_FINALIZE_DONE
+	add	INDEX, INDEX, #1		/* no match in the first three: take the fourth */
+KERNEL_F4_FINALIZE_DONE:
+.endm
+
+
+.macro INIT_S
+	lsl	INC_X, INC_X, #2		/* stride in bytes: 4 per float */
+	ld1	TMPVF, [X], INC_X
+	mov	Z, #1				/* strided path: Z = index of the element last loaded */
+	mov	INDEX, Z
+	fabs	MAXF, TMPF
+.endm
+
+.macro KERNEL_S1
+	ld1	TMPVF, [X], INC_X
+	add	Z, Z, #1			/* Z = index of the element just loaded */
+	fabs	TMPF, TMPF
+	fcmp	TMPF, MAXF
+	fcsel	MAXF, MAXF, TMPF, le
+	csel	INDEX, INDEX, Z, le
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+	PROLOGUE
+
+	cmp	N, xzr
+	ble	iamax_kernel_zero		/* N <= 0: return 0 */
+	cmp	INC_X, xzr
+	ble	iamax_kernel_zero		/* INC_X <= 0: return 0 */
+
+	PRFM	PLDL1KEEP, [X]
+	mov	X_COPY, X			/* kept for the rescan in KERNEL_F4_FINALIZE */
+
+	cmp	INC_X, #1
+	bne	iamax_kernel_S_BEGIN		/* non-unit stride: scalar strided path */
+
+iamax_kernel_F_BEGIN:
+	asr	I, N, #2			/* I = number of full 4-element blocks */
+	cmp	I, xzr
+	beq	iamax_kernel_F1_INIT		/* fewer than 4 elements: scalar only */
+
+	INIT_F4
+	subs	I, I, #1
+	beq	iamax_kernel_F4_FINALIZE
+
+iamax_kernel_F4:
+	KERNEL_F4
+	subs	I, I, #1
+	bne	iamax_kernel_F4
+
+iamax_kernel_F4_FINALIZE:
+	KERNEL_F4_FINALIZE
+
+iamax_kernel_F1:
+	ands	I, N, #3			/* I = tail elements (0..3) */
+	ble	iamax_kernel_L999
+
+iamax_kernel_F10:
+	KERNEL_F1
+	subs	I, I, #1
+	bne	iamax_kernel_F10
+	b	iamax_kernel_L999
+
+iamax_kernel_F1_INIT:
+	INIT_F1
+	subs	N, N, #1			/* first element already consumed by INIT_F1 */
+	b	iamax_kernel_F1
+
+iamax_kernel_S_BEGIN:
+	INIT_S
+
+	subs	N, N, #1			/* N = elements still to compare */
+	ble	iamax_kernel_L999
+
+	asr	I, N, #2			/* I = number of 4-element groups */
+	cmp	I, xzr
+	ble	iamax_kernel_S1
+
+iamax_kernel_S4:
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S4
+
+iamax_kernel_S1:
+	ands	I, N, #3			/* I = leftover elements (0..3) */
+	ble	iamax_kernel_L999
+
+iamax_kernel_S10:
+	KERNEL_S1
+	subs	I, I, #1
+	bne	iamax_kernel_S10
+
+iamax_kernel_L999:
+	mov	x0, INDEX			/* result: 1-based index */
+	ret
+
+iamax_kernel_zero:
+	mov	x0, xzr
+	ret
+
+	EPILOGUE
diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S
new file mode 100644
index 000000000..ebdc671e0
--- /dev/null
+++ b/kernel/arm64/izamax.S
@@ -0,0 +1,151 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	N	x0	/* vector length; also the return register */
+#define	X	x1	/* X vector address, post-incremented by each load */
+#define	INC_X	x2	/* X stride: elements on entry, bytes after INIT_S */
+#define INDEX	x3	/* 1-based index of the current max/min value */
+#define Z	x4	/* 1-based index of the element last loaded */
+#define I	x5	/* loop counter */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if defined(USE_MIN)
+#define COND	le
+#else
+#define COND	ge
+#endif
+
+#if !defined(DOUBLE)
+#define MAXF	s0
+#define TMPF	s1
+#define TMPVF	{v1.s}[0]
+#define SZ	4
+#else
+#define MAXF	d0
+#define TMPF	d1
+#define TMPVF	{v1.d}[0]
+#define SZ	8
+#endif
+
+/******************************************************************************/
+
+.macro INIT_S
+#if !defined(DOUBLE)
+	lsl	INC_X, INC_X, #3		/* stride in bytes: 8 per two-float element */
+	ld1	{v0.2s}, [X], INC_X		/* load both 32-bit parts of the element */
+	mov	Z, #1
+	mov	INDEX, Z			/* first element is the initial candidate */
+	fabs	v0.2s, v0.2s
+	ext	v1.8b, v0.8b, v0.8b, #4		/* s1 = second lane of v0 */
+	fadd	MAXF, s0, s1			/* MAXF = sum of the two absolute values */
+#else
+	lsl	INC_X, INC_X, #4		/* stride in bytes: 16 per two-double element */
+	ld1	{v0.2d}, [X], INC_X
+	mov	Z, #1
+	mov	INDEX, Z
+	fabs	v0.2d, v0.2d
+	faddp	MAXF, v0.2d			/* MAXF = sum of the two absolute values */
+#endif
+.endm
+
+.macro KERNEL_S1
+#if !defined(DOUBLE)
+	ld1	{v1.2s}, [X], INC_X
+	add	Z, Z, #1			/* Z = index of the element just loaded */
+	fabs	v1.2s, v1.2s
+	ext	v2.8b, v1.8b, v1.8b, #4		/* s2 = second lane of v1 */
+	fadd	TMPF, s1, s2
+#else
+	ld1	{v1.2d}, [X], INC_X
+	add	Z, Z, #1			/* Z = index of the element just loaded */
+	fabs	v1.2d, v1.2d
+	faddp	TMPF, v1.2d
+#endif
+	fcmp	MAXF, TMPF
+	fcsel	MAXF, MAXF, TMPF, COND		/* keep MAXF when COND holds, else take TMPF */
+	csel	INDEX, INDEX, Z, COND		/* INDEX follows the same selection */
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+	PROLOGUE
+
+	cmp	N, xzr
+	ble	iamax_kernel_zero		/* N <= 0: return 0 */
+	cmp	INC_X, xzr
+	ble	iamax_kernel_zero		/* INC_X <= 0: return 0 */
+
+	INIT_S
+
+	subs	N, N, #1			/* N = elements still to compare */
+	ble	iamax_kernel_L999
+
+	asr	I, N, #2			/* I = number of 4-element groups */
+	cmp	I, xzr
+	ble	iamax_kernel_S1
+
+iamax_kernel_S4:
+
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S4
+
+iamax_kernel_S1:
+
+	ands	I, N, #3			/* I = leftover elements (0..3) */
+	ble	iamax_kernel_L999
+
+iamax_kernel_S10:
+
+	KERNEL_S1
+
+	subs	I, I, #1
+	bne	iamax_kernel_S10
+
+iamax_kernel_L999:
+
+	mov	x0, INDEX			/* result: 1-based index */
+	ret
+
+iamax_kernel_zero:
+
+	mov	x0, xzr
+	ret
+
+	EPILOGUE