lapack-test fixes for Cortex A57
This commit is contained in:
parent
39937d15cd
commit
98965da2e8
|
@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S
|
|||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
ISAMAXKERNEL = isamax.S
|
||||
IDAMAXKERNEL = idamax.S
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
|
@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S
|
|||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
DOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
|
||||
SNRM2KERNEL = snrm2.S
|
||||
DNRM2KERNEL = dnrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
#SNRM2KERNEL = snrm2.S
|
||||
#DNRM2KERNEL = dnrm2.S
|
||||
#CNRM2KERNEL = znrm2.S
|
||||
#ZNRM2KERNEL = znrm2.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SCALKERNEL = scal.S
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
|
|
@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v17.4s, v17.4s
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v21.4s, v21.4s
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v25.4s, v25.4s
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v29.4s, v29.4s
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
|
||||
fmul v18.4s, v2.4s, v8.4s[0]
|
||||
OP_ii v18.4s, v3.4s, v9.4s[0]
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v19.4s, v19.4s
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.4s, v2.4s, v9.4s[0]
|
||||
#else
|
||||
fmul v19.4s, v2.4s, v9.4s[0]
|
||||
#endif
|
||||
OP_ir v19.4s, v3.4s, v8.4s[0]
|
||||
|
||||
fmul v22.4s, v2.4s, v8.4s[1]
|
||||
OP_ii v22.4s, v3.4s, v9.4s[1]
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v23.4s, v23.4s
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.4s, v2.4s, v9.4s[1]
|
||||
#else
|
||||
fmul v23.4s, v2.4s, v9.4s[1]
|
||||
#endif
|
||||
OP_ir v23.4s, v3.4s, v8.4s[1]
|
||||
|
||||
fmul v26.4s, v2.4s, v8.4s[2]
|
||||
OP_ii v26.4s, v3.4s, v9.4s[2]
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v27.4s, v27.4s
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.4s, v2.4s, v9.4s[2]
|
||||
#else
|
||||
fmul v27.4s, v2.4s, v9.4s[2]
|
||||
#endif
|
||||
OP_ir v27.4s, v3.4s, v8.4s[2]
|
||||
|
||||
fmul v30.4s, v2.4s, v8.4s[3]
|
||||
OP_ii v30.4s, v3.4s, v9.4s[3]
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v31.4s, v31.4s
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.4s, v2.4s, v9.4s[3]
|
||||
#else
|
||||
fmul v31.4s, v2.4s, v9.4s[3]
|
||||
#endif
|
||||
OP_ir v31.4s, v3.4s, v8.4s[3]
|
||||
|
||||
|
|
|
@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmul v16.4s, v0.4s, v8.4s[0]
|
||||
OP_ii v16.4s, v1.4s, v9.4s[0]
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v17.4s, v17.4s
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.4s, v0.4s, v9.4s[0]
|
||||
#else
|
||||
fmul v17.4s, v0.4s, v9.4s[0]
|
||||
#endif
|
||||
OP_ir v17.4s, v1.4s, v8.4s[0]
|
||||
|
||||
fmul v20.4s, v0.4s, v8.4s[1]
|
||||
OP_ii v20.4s, v1.4s, v9.4s[1]
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v21.4s, v21.4s
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.4s, v0.4s, v9.4s[1]
|
||||
#else
|
||||
fmul v21.4s, v0.4s, v9.4s[1]
|
||||
#endif
|
||||
OP_ir v21.4s, v1.4s, v8.4s[1]
|
||||
|
||||
fmul v24.4s, v0.4s, v8.4s[2]
|
||||
OP_ii v24.4s, v1.4s, v9.4s[2]
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v25.4s, v25.4s
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.4s, v0.4s, v9.4s[2]
|
||||
#else
|
||||
fmul v25.4s, v0.4s, v9.4s[2]
|
||||
#endif
|
||||
OP_ir v25.4s, v1.4s, v8.4s[2]
|
||||
|
||||
fmul v28.4s, v0.4s, v8.4s[3]
|
||||
OP_ii v28.4s, v1.4s, v9.4s[3]
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v29.4s, v29.4s
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.4s, v0.4s, v9.4s[3]
|
||||
#else
|
||||
fmul v29.4s, v0.4s, v9.4s[3]
|
||||
#endif
|
||||
OP_ir v29.4s, v1.4s, v8.4s[3]
|
||||
|
||||
|
|
|
@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define COND ge
|
||||
#endif
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define MAXF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define MAXF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
ld1 {v0.s}[0], [X], INC_X
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
ld1 {v0.d}[0], [X], INC_X
|
||||
#endif
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs MAXF, MAXF
|
||||
|
@ -107,9 +119,8 @@ iamax_kernel_S1:
|
|||
iamax_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
|
||||
iamax_kernel_L999:
|
||||
|
|
@ -1,213 +0,0 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define INDEX x3 /* index of max/min value */
|
||||
#define Z x4 /* vector index */
|
||||
#define I x5 /* loop variable */
|
||||
#define X_COPY x6 /* copy of X address */
|
||||
#define MAXF_Z x7
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define MAXF s5
|
||||
#define TMPF s6
|
||||
#define TMPVF {v6.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT_F1 - seed the unit-stride scalar scan with the first element.
 * Out:   MAXF = |first element|, INDEX = Z = 1, X advanced by SZ bytes.
 */
.macro INIT_F1
	mov	INDEX, #1			// the first element is the current best
	ldr	MAXF, [X], #SZ			// load it, post-incrementing X
	mov	Z, INDEX			// Z = index of the element just read
	fabs	MAXF, MAXF
.endm
|
||||
|
||||
/*
 * KERNEL_F1 - fold one more contiguous element into the running maximum.
 * In:    Z = index of the previously read element, MAXF = best |x| so far.
 * Out:   Z incremented; MAXF/INDEX replaced only when the new |x| is
 *        strictly greater, so the earliest maximum wins on ties.
 * Clobb: TMPF, flags.
 */
.macro KERNEL_F1
	ldr	TMPF, [X], #SZ
	fabs	TMPF, TMPF
	add	Z, Z, #1			// Z now names the element held in TMPF
	fcmp	TMPF, MAXF
	fcsel	MAXF, TMPF, MAXF, gt		// take TMPF only when TMPF > MAXF
	csel	INDEX, Z, INDEX, gt
.endm
|
||||
|
||||
/*
 * INIT_F4 - seed the vector scan with the first group of four floats.
 * Out:   MAXF   = largest |x| among the four elements loaded,
 *        MAXF_Z = 1 (index of the first element of that group),
 *        Z      = 5 (index of the first element of the next group),
 *        X advanced by 16 bytes.
 * Clobb: v0.
 */
.macro INIT_F4
	mov	MAXF_Z, #1
	mov	Z, #5
	ld1	{v0.4s}, [X], #16
	fabs	v0.4s, v0.4s
	fmaxv	MAXF, v0.4s			// horizontal max over the four lanes
.endm
|
||||
|
||||
/*
 * KERNEL_F4 - fold one group of four contiguous floats into the running max.
 * In:    Z = index of the first element of the group about to be loaded.
 * Out:   MAXF/MAXF_Z updated only when the group's largest |x| is strictly
 *        greater than MAXF (MAXF_Z then records the group's first index);
 *        Z advanced by 4; X advanced by 16 bytes.
 * Clobb: v0, TMPF, flags.
 */
.macro KERNEL_F4
	ld1	{v0.4s}, [X], #16
	PRFM	PLDL1KEEP, [X, #512]		// prefetch hint 512 bytes past the new X
	fabs	v0.4s, v0.4s
	fmaxv	TMPF, v0.4s			// largest |x| within this group
	fcmp	TMPF, MAXF
	fcsel	MAXF, TMPF, MAXF, gt
	csel	MAXF_Z, Z, MAXF_Z, gt		// must read Z before it is advanced
	add	Z, Z, #4
.endm
|
||||
|
||||
|
||||
/*
 * KERNEL_F4_FINALIZE - turn the vector loop's "winning group" into an exact
 * element index.
 *
 * In:    MAXF   = largest |x| found by INIT_F4 / KERNEL_F4,
 *        MAXF_Z = index of the first element of the group that produced it,
 *        X_COPY = original X address,
 *        Z      = index of the first element NOT yet read by the vector loop.
 * Out:   INDEX  = index of the first element of that group whose |x| equals
 *                 MAXF,
 *        Z      = index of the last element already read, which is what
 *                 KERNEL_F1 expects because it increments Z before use.
 * Clobb: MAXF_Z (becomes a byte offset), X_COPY, TMPF, flags.
 *
 * The label below is a plain global label, so the macro may only be expanded
 * once per object file.
 */
.macro KERNEL_F4_FINALIZE
	mov	INDEX, MAXF_Z
	sub	MAXF_Z, MAXF_Z, #1		// zero-based element offset ...
	lsl	MAXF_Z, MAXF_Z, #2		// ... scaled to bytes (4-byte floats)
	add	X_COPY, X_COPY, MAXF_Z		// X_COPY -> first element of the group

	ldr	TMPF, [X_COPY], #SZ		// element 0 of the group
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1
	ldr	TMPF, [X_COPY], #SZ		// element 1
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1
	ldr	TMPF, [X_COPY], #SZ		// element 2
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1		// none of the first three: take element 3 unread

KERNEL_F4_FINALIZE_DONE:
	// The vector loop leaves Z one past the last element it read, while the
	// scalar tail (KERNEL_F1) pre-increments Z.  Without this adjustment a
	// maximum located in the tail is reported one position too high.
	sub	Z, Z, #1
.endm
|
||||
|
||||
|
||||
/*
 * INIT_S - seed the strided scan with the first element.
 * Out:   INC_X scaled from elements to bytes (x4),
 *        MAXF = |first element|, INDEX = Z = 1, X advanced by INC_X bytes.
 * Clobb: lane 0 of the TMPVF register.
 */
.macro INIT_S
	mov	INDEX, #1
	mov	Z, INDEX
	lsl	INC_X, INC_X, #2		// stride in bytes
	ld1	TMPVF, [X], INC_X		// register post-index: X += INC_X
	fabs	MAXF, TMPF
.endm
|
||||
|
||||
/*
 * KERNEL_S1 - fold one more strided element into the running maximum.
 * In:    INC_X = stride in bytes, Z = index of the previously read element.
 * Out:   Z incremented; MAXF/INDEX replaced only when the new |x| is
 *        strictly greater than MAXF; X advanced by INC_X bytes.
 * Clobb: TMPF (lane 0 of the TMPVF register), flags.
 */
.macro KERNEL_S1
	ld1	TMPVF, [X], INC_X
	fabs	TMPF, TMPF
	add	Z, Z, #1			// Z now names the element held in TMPF
	fcmp	TMPF, MAXF
	fcsel	MAXF, TMPF, MAXF, gt		// take TMPF only when TMPF > MAXF
	csel	INDEX, Z, INDEX, gt
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE // entry point: N = x0, X = x1, INC_X = x2; result is returned in x0
|
||||
|
||||
cmp N, xzr // signed test: N <= 0 means there is nothing to scan
|
||||
ble iamax_kernel_zero // -> return 0
|
||||
cmp INC_X, xzr // a zero or negative stride is rejected the same way
|
||||
ble iamax_kernel_zero
|
||||
|
||||
PRFM PLDL1KEEP, [X] // prefetch hint for the start of the vector
|
||||
mov X_COPY, X // keep the base address; KERNEL_F4_FINALIZE re-reads from it
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN // any stride other than 1 takes the strided path
|
||||
|
||||
iamax_kernel_F_BEGIN: // ---- unit-stride path ----
|
||||
asr I, N, #2 // I = N / 4 = number of complete 4-element groups
|
||||
cmp I, xzr
|
||||
beq iamax_kernel_F1_INIT // fewer than 4 elements: scalar scan only
|
||||
|
||||
INIT_F4 // consume the first group and seed MAXF / MAXF_Z / Z
|
||||
subs I, I, #1
|
||||
beq iamax_kernel_F4_FINALIZE // there was only one complete group
|
||||
|
||||
iamax_kernel_F4: // loop over the remaining complete groups
|
||||
KERNEL_F4
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F4
|
||||
|
||||
iamax_kernel_F4_FINALIZE: // resolve the winning group to a single element index
|
||||
KERNEL_F4_FINALIZE
|
||||
|
||||
iamax_kernel_F1: // scalar tail; also entered from iamax_kernel_F1_INIT
|
||||
ands I, N, #3 // I = N mod 4 = elements still unread
|
||||
ble iamax_kernel_L999 // none left
|
||||
|
||||
iamax_kernel_F10: // one element per iteration
|
||||
KERNEL_F1
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F1_INIT: // reached only when N is 1, 2 or 3
|
||||
INIT_F1 // seed the scan with the first element
|
||||
subs N, N, #1 // one element consumed, so N & 3 at iamax_kernel_F1 counts the rest
|
||||
b iamax_kernel_F1
|
||||
|
||||
iamax_kernel_S_BEGIN: // ---- strided path ----
|
||||
INIT_S // scales INC_X to bytes and seeds the scan with the first element
|
||||
|
||||
subs N, N, #1 // N = elements still unread
|
||||
ble iamax_kernel_L999 // the vector had a single element
|
||||
|
||||
asr I, N, #2 // I = number of 4x-unrolled iterations
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_S1
|
||||
|
||||
iamax_kernel_S4: // 4x-unrolled strided loop
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S4
|
||||
|
||||
iamax_kernel_S1: // strided tail
|
||||
ands I, N, #3 // I = leftover elements after the unrolled loop
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S10: // one element per iteration
|
||||
KERNEL_S1
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
|
||||
iamax_kernel_L999: // common exit: return the index recorded in INDEX
|
||||
mov x0, INDEX
|
||||
ret
|
||||
|
||||
iamax_kernel_zero: // invalid N or INC_X: return 0
|
||||
mov x0, xzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/*
 * INIT_F1 - place the negated scalar S in the upper lane of v1.
 *
 * S is a scalar FP register alias defined elsewhere in the file.  The negated
 * value is formed as 0 - S: v2 is cleared with eor and S is subtracted from
 * it.  An fneg into the same register ahead of the eor would be overwritten
 * before anything reads it, so none is emitted.
 *
 * Out:   v1 lane 1 = 0 - S (lane 0 of v1 is left untouched).
 * Clobb: v2.
 */
.macro INIT_F1
#if !defined(DOUBLE)
	eor	v2.16b, v2.16b, v2.16b		// v2 = 0
	fsub	s2, s2, S			// s2 = 0 - S
	ins	v1.s[1], v2.s[0]		// [-S, S]
#else
	eor	v2.16b, v2.16b, v2.16b		// v2 = 0
	fsub	d2, d2, S			// d2 = 0 - S
	ins	v1.d[1], v2.d[0]		// [-S, S]
#endif
.endm
|
||||
|
|
|
@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(DOUBLE)
|
||||
#define DA_R s0 /* scale input value */
|
||||
#define DA_I s1 /* scale input value */
|
||||
#define TMPX v2.2s
|
||||
#define TMPY v3.2s
|
||||
#define SZ 4
|
||||
#else
|
||||
#define DA_R d0 /* scale input value */
|
||||
#define DA_I d1 /* scale input value */
|
||||
#define TMPX v2.2d
|
||||
#define TMPY v3.2d
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
|
@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(CONJ)
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
fneg s2, DA_I
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, DA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
|
||||
fneg d2, DA_I
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, DA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
|
||||
#endif
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
fneg s2, DA_R
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, DA_R
|
||||
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
|
||||
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
|
||||
#else
|
||||
fneg d2, DA_R
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, DA_R
|
||||
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
|
||||
#endif
|
||||
|
@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL_INIT_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
// Replicate the lower 2 floats into the upper 2 slots
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
|
||||
ins v16.s[0], v0.s[0] // v16 lane 0 = lane 0 of v0
|
||||
ins v16.s[1], v16.s[0] // copy lane 0 into lane 1
|
||||
ins v16.d[1], v16.d[0] // copy the low 64 bits up: all four lanes of v16 now equal v0.s[0]
|
||||
#if !defined(CONJ)
|
||||
ins v17.s[0], v1.s[1] // non-CONJ build seeds v17 from lane 1 of v1
|
||||
#else
|
||||
ins v17.s[0], v1.s[0] // CONJ build seeds v17 from lane 0 of v1
|
||||
#endif
|
||||
ins v17.s[1], v17.s[0] // copy lane 0 into lane 1
|
||||
ins v17.d[1], v17.d[0] // all four lanes of v17 now hold the selected v1 lane
|
||||
#else //DOUBLE
|
||||
ins v16.d[0], v0.d[0] // v16 lane 0 = lane 0 of v0
|
||||
ins v16.d[1], v16.d[0] // both lanes of v16 now equal v0.d[0]
|
||||
#if !defined(CONJ)
|
||||
ins v17.d[0], v1.d[1] // non-CONJ build seeds v17 from lane 1 of v1
|
||||
#else
|
||||
ins v17.d[0], v1.d[0] // CONJ build seeds v17 from lane 0 of v1
|
||||
#endif
|
||||
ins v17.d[1], v17.d[0] // both lanes of v17 now hold the selected v1 lane
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
|
||||
// V3 = X[7], X[6], X[5], X[4]
|
||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
|
||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
|
||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
|
||||
ld2 {v2.4s, v3.4s}, [X], #32
|
||||
ld2 {v4.4s, v5.4s}, [Y_COPY], #32
|
||||
|
||||
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
|
||||
// V5 = Y[7], Y[6], Y[5], Y[4]
|
||||
fmla v4.4s, v2.4s, v16.4s
|
||||
#if !defined(CONJ)
|
||||
fmls v4.4s, v3.4s, v17.4s
|
||||
#else
|
||||
fmla v4.4s, v3.4s, v17.4s
|
||||
#endif
|
||||
|
||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
|
||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
|
||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
|
||||
#if !defined(CONJ)
|
||||
fmla v5.4s, v2.4s, v17.4s
|
||||
#else
|
||||
fmls v5.4s, v2.4s, v17.4s
|
||||
#endif
|
||||
fmla v5.4s, v3.4s, v16.4s
|
||||
|
||||
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v4.4s}, [Y], #16
|
||||
|
||||
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
|
||||
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v5.4s}, [Y], #16
|
||||
st2 {v4.4s, v5.4s}, [Y], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
|
||||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
|
||||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
|
||||
ld2 {v2.2d, v3.2d}, [X], #32
|
||||
ld2 {v4.2d, v5.2d}, [Y_COPY], #32
|
||||
|
||||
ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
|
||||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
|
||||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
|
||||
fmla v4.2d, v2.2d, v16.2d
|
||||
#if !defined(CONJ)
|
||||
fmls v4.2d, v3.2d, v17.2d
|
||||
#else
|
||||
fmla v4.2d, v3.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v5.2d, v2.2d, v17.2d
|
||||
#else
|
||||
fmls v5.2d, v2.2d, v17.2d
|
||||
#endif
|
||||
fmla v5.2d, v3.2d, v16.2d
|
||||
|
||||
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
|
||||
st2 {v4.2d, v5.2d}, [Y], #32
|
||||
|
||||
fmla v16.2d, v0.2d, v2.2d
|
||||
fmla v17.2d, v0.2d, v3.2d
|
||||
ld2 {v18.2d, v19.2d}, [X], #32
|
||||
ld2 {v20.2d, v21.2d}, [Y_COPY], #32
|
||||
|
||||
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
|
||||
fmla v20.2d, v18.2d, v16.2d
|
||||
#if !defined(CONJ)
|
||||
fmls v20.2d, v19.2d, v17.2d
|
||||
#else
|
||||
fmla v20.2d, v19.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v21.2d, v18.2d, v17.2d
|
||||
#else
|
||||
fmls v21.2d, v18.2d, v17.2d
|
||||
#endif
|
||||
fmla v21.2d, v19.2d, v16.2d
|
||||
|
||||
fmla v16.2d, v1.2d, v20.2d
|
||||
fmla v17.2d, v1.2d, v21.2d
|
||||
st1 {v16.2d,v17.2d}, [Y], #32
|
||||
|
||||
fmla v18.2d, v0.2d, v4.2d
|
||||
fmla v19.2d, v0.2d, v5.2d
|
||||
fmla v18.2d, v1.2d, v22.2d
|
||||
fmla v19.2d, v1.2d, v23.2d
|
||||
st1 {v18.2d,v19.2d}, [Y], #32
|
||||
st2 {v20.2d, v21.2d}, [Y], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #512]
|
||||
PRFM PLDL1KEEP, [Y, #512]
|
||||
|
|
|
@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v17.2d, v17.2d
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v19.2d, v19.2d
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v21.2d, v21.2d
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v23.2d, v23.2d
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v25.2d, v25.2d
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v27.2d, v27.2d
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v29.2d, v29.2d
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v31.2d, v31.2d
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
|
||||
|
|
|
@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/******* INIT FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
|
||||
fneg s2, ALPHA_I
|
||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, ALPHA_I
|
||||
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
|
||||
#if !defined(XCONJ)
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
|
||||
#endif
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
|
||||
fneg d2, ALPHA_I
|
||||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, ALPHA_I
|
||||
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
|
||||
#if !defined(XCONJ)
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
|
||||
|
@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
|
||||
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ins v3.s[0], v2.s[1]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fneg s4, s3
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub s4, s4, s3
|
||||
ins v3.s[1], v4.s[0]
|
||||
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
|
||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg s4, s3
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub s4, s4, s3
|
||||
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
|
||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
|
||||
fneg s4, s2
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub s4, s4, s2
|
||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg s3, s3
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub s3, s4, s3
|
||||
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
|
||||
fneg s4, s2
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub s4, s4, s2
|
||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
|
||||
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
|
||||
eor v12.16b, v12.16b, v12.16b
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ins v3.d[0], v2.d[1] // I(TEMP)
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fneg d4, d3 // -I(TEMP)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub d4, d4, d3
|
||||
ins v3.d[1], v4.d[0]
|
||||
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
|
||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg d4, d3 // -I(TEMP)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub d4, d4, d3
|
||||
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
|
||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
|
||||
fneg d4, d2 // -R(TEMP)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub d4, d4, d2
|
||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg d3, d3 // -I(TEMP)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub d3, d4, d3
|
||||
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
|
||||
fneg d4, d2 // -R(TEMP)
|
||||
eor v4.16b, v4.16b, v4.16b
|
||||
fsub d4, d4, d2
|
||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
|
|
@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(XCONJ)
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
|
||||
fneg s2, ALPHA_I
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, ALPHA_I
|
||||
ins v1.s[1], v2.s[0]
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
|
||||
fneg d2, ALPHA_I
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, ALPHA_I
|
||||
ins v1.d[1], v2.d[0]
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
|
||||
#endif
|
||||
#else // XCONJ
|
||||
#if !defined(DOUBLE)
|
||||
fneg s2, ALPHA_R
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, ALPHA_R
|
||||
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
|
||||
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
|
||||
#else
|
||||
fneg d2, ALPHA_R
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, ALPHA_R
|
||||
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
|
||||
#endif
|
||||
|
@ -136,89 +140,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld2 {v11.4s, v12.4s}, [X_PTR], #32
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
|
||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
|
||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
|
||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
|
||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
|
||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
|
||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
|
||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
|
||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
#else // DOUBLE
|
||||
ld2 {v11.2d, v12.2d}, [X_PTR], #32
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #512]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
|
||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
|
||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
|
||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
|
||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
|
||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
|
||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
|
||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
|
||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
ld2 {v17.2d, v18.2d}, [X_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [A_PTR, #512]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
|
||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
|
||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
|
||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
|
||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
|
||||
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
|
||||
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
|
||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
|
||||
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
|
||||
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
#endif //DOUBLE
|
||||
.endm
|
||||
|
||||
|
@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
|
||||
ld1 {v5.s}[0], [A_PTR], #4 // A1
|
||||
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
|
||||
fneg s16, s5
|
||||
eor v16.16b, v16.16b, v16.16b
|
||||
fsub s16, s16, s5
|
||||
ins v5.s[1], v16.s[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
|
||||
|
@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
|
||||
ld1 {v5.d}[0], [A_PTR], #8 // A1
|
||||
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
|
||||
fneg d16, d5
|
||||
eor v16.16b, v16.16b, v16.16b
|
||||
fsub d16, d16, d5
|
||||
ins v5.d[1], v16.d[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
|
||||
|
@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
|
||||
ld1 {v5.s}[0], [A_PTR], #4 // A1
|
||||
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
|
||||
fneg s16, s5
|
||||
eor v16.16b, v16.16b, v16.16b
|
||||
fsub s16, s16, s5
|
||||
ins v5.s[1], v16.s[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
|
||||
|
@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
|
||||
ld1 {v5.d}[0], [A_PTR], #8 // A1
|
||||
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
|
||||
fneg d16, d5
|
||||
eor v16.16b, v16.16b, v16.16b
|
||||
fsub d16, d16, d5
|
||||
ins v5.d[1], v16.d[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
|
||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define X x3 /* X vector address */
|
||||
#define INC_X x4 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
#define X_COPY x6 /* Copy of X */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
|
@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro INIT
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
fneg s2, DA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
|
||||
fneg d2, DA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
|
||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
|
||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2s}, [X], #8
|
||||
fmul s3, DA_R, v2.s[0] // DA_R*X0
|
||||
fmul s5, DA_I, v2.s[1] // DA_I*X1
|
||||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1
|
||||
|
||||
fmul s4, DA_I, v2.s[0] // DA_I*X0
|
||||
fmul s5, DA_R, v2.s[1] // DA_R*X1
|
||||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1
|
||||
|
||||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v3.2s}, [X], #8
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
|
||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
|
||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2d}, [X], #16
|
||||
#endif
|
||||
fmul d3, DA_R, v2.d[0] // DA_R*X0
|
||||
fmul d5, DA_I, v2.d[1] // DA_I*X1
|
||||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1
|
||||
|
||||
fmul d4, DA_I, v2.d[0] // DA_I*X0
|
||||
fmul d5, DA_R, v2.d[1] // DA_R*X1
|
||||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1
|
||||
|
||||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v3.2d}, [X], #16
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro KERNEL_INIT_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
// Replicate the lower 2 floats into the upper 2 slots
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
|
||||
ins v16.s[0], v0.s[0]
|
||||
ins v16.s[1], v16.s[0]
|
||||
ins v16.d[1], v16.d[0]
|
||||
ins v17.s[0], v1.s[0]
|
||||
ins v17.s[1], v17.s[0]
|
||||
ins v17.d[1], v17.d[0]
|
||||
#else //DOUBLE
|
||||
ins v16.d[0], v0.d[0]
|
||||
ins v16.d[1], v16.d[0]
|
||||
ins v17.d[0], v1.d[0]
|
||||
ins v17.d[1], v17.d[0]
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
|
||||
// V3 = X[7], X[6], X[5], X[4]
|
||||
ld2 {v2.4s, v3.4s}, [X], #32
|
||||
|
||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
|
||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
|
||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
|
||||
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix]
|
||||
// X'[ix+1] += DA_R * X[ix+1]
|
||||
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
|
||||
// X'[ix+1] += DA_I * X[ix]
|
||||
fmul v4.4s, v2.4s, v16.4s
|
||||
fmul v6.4s, v3.4s, v17.4s
|
||||
fsub v4.4s, v4.4s, v6.4s
|
||||
|
||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
|
||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
|
||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
|
||||
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix]
|
||||
// X'[ix+1] += DA_R * X[ix+1]
|
||||
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
|
||||
// X'[ix+1] += DA_I * X[ix]
|
||||
fmul v5.4s, v2.4s, v17.4s
|
||||
fmul v6.4s, v3.4s, v16.4s
|
||||
fadd v5.4s, v5.4s, v6.4s
|
||||
|
||||
st1 {v2.4s,v3.4s}, [X], #32
|
||||
st2 {v4.4s, v5.4s}, [X_COPY], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
|
||||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
|
||||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
|
||||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
|
||||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
|
||||
ld2 {v2.2d, v3.2d}, [X], #32
|
||||
|
||||
fmul v2.2d, v0.2d, v2.2d
|
||||
fmla v2.2d, v1.2d, v20.2d
|
||||
fmul v4.2d, v2.2d, v16.2d
|
||||
fmul v6.2d, v3.2d, v17.2d
|
||||
fsub v4.2d, v4.2d, v6.2d
|
||||
fmul v5.2d, v2.2d, v17.2d
|
||||
fmul v6.2d, v3.2d, v16.2d
|
||||
fadd v5.2d, v5.2d, v6.2d
|
||||
|
||||
fmul v3.2d, v0.2d, v3.2d
|
||||
fmla v3.2d, v1.2d, v21.2d
|
||||
st1 {v2.2d,v3.2d}, [X], #32
|
||||
st2 {v4.2d, v5.2d}, [X_COPY], #32
|
||||
|
||||
fmul v4.2d, v0.2d, v4.2d
|
||||
fmla v4.2d, v1.2d, v22.2d
|
||||
ld2 {v18.2d, v19.2d}, [X], #32
|
||||
|
||||
fmul v5.2d, v0.2d, v5.2d
|
||||
fmla v5.2d, v1.2d, v23.2d
|
||||
st1 {v4.2d,v5.2d}, [X], #32
|
||||
fmul v20.2d, v18.2d, v16.2d
|
||||
fmul v6.2d, v19.2d, v17.2d
|
||||
fsub v20.2d, v20.2d, v6.2d
|
||||
fmul v21.2d, v18.2d, v17.2d
|
||||
fmul v6.2d, v19.2d, v16.2d
|
||||
fadd v21.2d, v21.2d, v6.2d
|
||||
|
||||
st2 {v20.2d, v21.2d}, [X_COPY], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
|
||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
|
||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2s}, [X], INC_X
|
||||
fmul s3, DA_R, v2.s[0] // DA_R*X0
|
||||
fmul s5, DA_I, v2.s[1] // DA_I*X1
|
||||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1
|
||||
|
||||
fmul s4, DA_I, v2.s[0] // DA_I*X0
|
||||
fmul s5, DA_R, v2.s[1] // DA_R*X1
|
||||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1
|
||||
|
||||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v3.2s}, [X], INC_X
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
|
||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
|
||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2d}, [X], INC_X
|
||||
#endif
|
||||
fmul d3, DA_R, v2.d[0] // DA_R*X0
|
||||
fmul d5, DA_I, v2.d[1] // DA_I*X1
|
||||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1
|
||||
|
||||
fmul d4, DA_I, v2.d[0] // DA_I*X0
|
||||
fmul d5, DA_R, v2.d[1] // DA_R*X1
|
||||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1
|
||||
|
||||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v3.2d}, [X], INC_X
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
|
@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
b zscal_begin
|
||||
data_ar:
|
||||
.word 0x3e44fae6
|
||||
data_ai:
|
||||
.word 0x3d320fa2
|
||||
data_xr:
|
||||
.word 0x3f4baff1
|
||||
data_xi:
|
||||
.word 0xbe8ef0bd
|
||||
|
||||
zscal_begin:
|
||||
|
||||
ldr s20, data_ar
|
||||
ldr s21, data_ai
|
||||
ldr s22, data_xr
|
||||
ldr s23, data_xi
|
||||
|
||||
fmul s24, s22, s21
|
||||
fmla s24, s23, v20.s[0]
|
||||
|
||||
fmul s25, s22, s21
|
||||
fmul s26, s23, s20
|
||||
fadd s25, s25, s26
|
||||
|
||||
mov X_COPY, X
|
||||
|
||||
cmp N, xzr
|
||||
ble zscal_kernel_L999
|
||||
|
||||
fcmp DA_R, #0.0
|
||||
bne zscal_kernel_1
|
||||
bne zscal_kernel_R_non_zero
|
||||
|
||||
fcmp DA_I, #0.0
|
||||
beq zscal_kernel_zero
|
||||
beq zscal_kernel_RI_zero
|
||||
|
||||
// TODO: special case DA_R == 0 && DA_I != 0
|
||||
b zscal_kernel_R_zero
|
||||
|
||||
zscal_kernel_1:
|
||||
zscal_kernel_R_non_zero:
|
||||
|
||||
// TODO: special case DA_R != 0 && DA_I == 0
|
||||
fcmp DA_I, #0.0
|
||||
beq zscal_kernel_I_zero
|
||||
|
||||
/*******************************************************************************
|
||||
* A_R != 0 && A_I != 0
|
||||
*******************************************************************************/
|
||||
|
||||
zscal_kernel_RI_non_zero:
|
||||
|
||||
INIT
|
||||
|
||||
|
@ -257,16 +306,85 @@ zscal_kernel_L999:
|
|||
mov w0, wzr
|
||||
ret
|
||||
|
||||
zscal_kernel_zero:
|
||||
/*******************************************************************************
|
||||
* A_R == 0 && A_I != 0
|
||||
*******************************************************************************/
|
||||
|
||||
zscal_kernel_R_zero:
|
||||
INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub s2, s2, DA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
|
||||
#else
|
||||
eor v2.16b, v2.16b, v2.16b
|
||||
fsub d2, d2, DA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
|
||||
#endif
|
||||
|
||||
zscal_kernel_R_zero_1:
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
|
||||
ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1
|
||||
st1 {v2.2s}, [X]
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0
|
||||
ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1
|
||||
st1 {v2.2d}, [X]
|
||||
#endif
|
||||
add X, X, INC_X
|
||||
subs N, N, #1
|
||||
bne zscal_kernel_R_zero_1
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
/*******************************************************************************
|
||||
* A_R != 0 && A_I == 0
|
||||
*******************************************************************************/
|
||||
|
||||
zscal_kernel_I_zero:
|
||||
INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
|
||||
#endif
|
||||
|
||||
zscal_kernel_I_zero_1:
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
|
||||
st1 {v2.2s}, [X]
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
|
||||
st1 {v2.2d}, [X]
|
||||
#endif
|
||||
add X, X, INC_X
|
||||
subs N, N, #1
|
||||
bne zscal_kernel_I_zero_1
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
/*******************************************************************************
|
||||
* A_R == 0 && A_I == 0
|
||||
*******************************************************************************/
|
||||
|
||||
zscal_kernel_RI_zero:
|
||||
|
||||
INIT_S
|
||||
|
||||
zscal_kernel_Z1:
|
||||
zscal_kernel_RI_zero_1:
|
||||
|
||||
stp DA_R, DA_I, [X]
|
||||
add X, X, INC_X
|
||||
subs N, N, #1
|
||||
bne zscal_kernel_Z1
|
||||
subs N, N, #1
|
||||
bne zscal_kernel_RI_zero_1
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
|
|
@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
OP_ii v16.2d, v1.2d, v9.2d[0]
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v17.2d, v17.2d
|
||||
eor v17.16b, v17.16b, v17.16b
|
||||
fmls v17.2d, v0.2d, v9.2d[0]
|
||||
#else
|
||||
fmul v17.2d, v0.2d, v9.2d[0]
|
||||
#endif
|
||||
OP_ir v17.2d, v1.2d, v8.2d[0]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
OP_ii v18.2d, v3.2d, v9.2d[0]
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v19.2d, v19.2d
|
||||
eor v19.16b, v19.16b, v19.16b
|
||||
fmls v19.2d, v2.2d, v9.2d[0]
|
||||
#else
|
||||
fmul v19.2d, v2.2d, v9.2d[0]
|
||||
#endif
|
||||
OP_ir v19.2d, v3.2d, v8.2d[0]
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
OP_ii v20.2d, v1.2d, v9.2d[1]
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v21.2d, v21.2d
|
||||
eor v21.16b, v21.16b, v21.16b
|
||||
fmls v21.2d, v0.2d, v9.2d[1]
|
||||
#else
|
||||
fmul v21.2d, v0.2d, v9.2d[1]
|
||||
#endif
|
||||
OP_ir v21.2d, v1.2d, v8.2d[1]
|
||||
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
OP_ii v22.2d, v3.2d, v9.2d[1]
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v23.2d, v23.2d
|
||||
eor v23.16b, v23.16b, v23.16b
|
||||
fmls v23.2d, v2.2d, v9.2d[1]
|
||||
#else
|
||||
fmul v23.2d, v2.2d, v9.2d[1]
|
||||
#endif
|
||||
OP_ir v23.2d, v3.2d, v8.2d[1]
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
OP_ii v24.2d, v1.2d, v11.2d[0]
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v25.2d, v25.2d
|
||||
eor v25.16b, v25.16b, v25.16b
|
||||
fmls v25.2d, v0.2d, v11.2d[0]
|
||||
#else
|
||||
fmul v25.2d, v0.2d, v11.2d[0]
|
||||
#endif
|
||||
OP_ir v25.2d, v1.2d, v10.2d[0]
|
||||
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
OP_ii v26.2d, v3.2d, v11.2d[0]
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v27.2d, v27.2d
|
||||
eor v27.16b, v27.16b, v27.16b
|
||||
fmls v27.2d, v2.2d, v11.2d[0]
|
||||
#else
|
||||
fmul v27.2d, v2.2d, v11.2d[0]
|
||||
#endif
|
||||
OP_ir v27.2d, v3.2d, v10.2d[0]
|
||||
|
||||
fmul v28.2d, v0.2d, v10.2d[1]
|
||||
OP_ii v28.2d, v1.2d, v11.2d[1]
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v29.2d, v29.2d
|
||||
eor v29.16b, v29.16b, v29.16b
|
||||
fmls v29.2d, v0.2d, v11.2d[1]
|
||||
#else
|
||||
fmul v29.2d, v0.2d, v11.2d[1]
|
||||
#endif
|
||||
OP_ir v29.2d, v1.2d, v10.2d[1]
|
||||
|
||||
fmul v30.2d, v2.2d, v10.2d[1]
|
||||
OP_ii v30.2d, v3.2d, v11.2d[1]
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
fneg v31.2d, v31.2d
|
||||
eor v31.16b, v31.16b, v31.16b
|
||||
fmls v31.2d, v2.2d, v11.2d[1]
|
||||
#else
|
||||
fmul v31.2d, v2.2d, v11.2d[1]
|
||||
#endif
|
||||
OP_ir v31.2d, v3.2d, v10.2d[1]
|
||||
|
||||
|
|
Loading…
Reference in New Issue