lapack-test fixes for Cortex A57

This commit is contained in:
Ashwin Sekhar T K 2015-11-20 01:15:04 +05:30
parent 39937d15cd
commit 98965da2e8
12 changed files with 452 additions and 475 deletions

View File

@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = isamax.S
IDAMAXKERNEL = idamax.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
DOTKERNEL = dot.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
SNRM2KERNEL = snrm2.S
DNRM2KERNEL = dnrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
#SNRM2KERNEL = snrm2.S
#DNRM2KERNEL = dnrm2.S
#CNRM2KERNEL = znrm2.S
#ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SCALKERNEL = scal.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

View File

@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v16.4s, v0.4s, v8.4s[0]
OP_ii v16.4s, v1.4s, v9.4s[0]
fmul v17.4s, v0.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v17.4s, v17.4s
eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0]
#else
fmul v17.4s, v0.4s, v9.4s[0]
#endif
OP_ir v17.4s, v1.4s, v8.4s[0]
fmul v20.4s, v0.4s, v8.4s[1]
OP_ii v20.4s, v1.4s, v9.4s[1]
fmul v21.4s, v0.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v21.4s, v21.4s
eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1]
#else
fmul v21.4s, v0.4s, v9.4s[1]
#endif
OP_ir v21.4s, v1.4s, v8.4s[1]
fmul v24.4s, v0.4s, v8.4s[2]
OP_ii v24.4s, v1.4s, v9.4s[2]
fmul v25.4s, v0.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v25.4s, v25.4s
eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2]
#else
fmul v25.4s, v0.4s, v9.4s[2]
#endif
OP_ir v25.4s, v1.4s, v8.4s[2]
fmul v28.4s, v0.4s, v8.4s[3]
OP_ii v28.4s, v1.4s, v9.4s[3]
fmul v29.4s, v0.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v29.4s, v29.4s
eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3]
#else
fmul v29.4s, v0.4s, v9.4s[3]
#endif
OP_ir v29.4s, v1.4s, v8.4s[3]
fmul v18.4s, v2.4s, v8.4s[0]
OP_ii v18.4s, v3.4s, v9.4s[0]
fmul v19.4s, v2.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v19.4s, v19.4s
eor v19.16b, v19.16b, v19.16b
fmls v19.4s, v2.4s, v9.4s[0]
#else
fmul v19.4s, v2.4s, v9.4s[0]
#endif
OP_ir v19.4s, v3.4s, v8.4s[0]
fmul v22.4s, v2.4s, v8.4s[1]
OP_ii v22.4s, v3.4s, v9.4s[1]
fmul v23.4s, v2.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v23.4s, v23.4s
eor v23.16b, v23.16b, v23.16b
fmls v23.4s, v2.4s, v9.4s[1]
#else
fmul v23.4s, v2.4s, v9.4s[1]
#endif
OP_ir v23.4s, v3.4s, v8.4s[1]
fmul v26.4s, v2.4s, v8.4s[2]
OP_ii v26.4s, v3.4s, v9.4s[2]
fmul v27.4s, v2.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v27.4s, v27.4s
eor v27.16b, v27.16b, v27.16b
fmls v27.4s, v2.4s, v9.4s[2]
#else
fmul v27.4s, v2.4s, v9.4s[2]
#endif
OP_ir v27.4s, v3.4s, v8.4s[2]
fmul v30.4s, v2.4s, v8.4s[3]
OP_ii v30.4s, v3.4s, v9.4s[3]
fmul v31.4s, v2.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v31.4s, v31.4s
eor v31.16b, v31.16b, v31.16b
fmls v31.4s, v2.4s, v9.4s[3]
#else
fmul v31.4s, v2.4s, v9.4s[3]
#endif
OP_ir v31.4s, v3.4s, v8.4s[3]

View File

@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v16.4s, v0.4s, v8.4s[0]
OP_ii v16.4s, v1.4s, v9.4s[0]
fmul v17.4s, v0.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v17.4s, v17.4s
eor v17.16b, v17.16b, v17.16b
fmls v17.4s, v0.4s, v9.4s[0]
#else
fmul v17.4s, v0.4s, v9.4s[0]
#endif
OP_ir v17.4s, v1.4s, v8.4s[0]
fmul v20.4s, v0.4s, v8.4s[1]
OP_ii v20.4s, v1.4s, v9.4s[1]
fmul v21.4s, v0.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v21.4s, v21.4s
eor v21.16b, v21.16b, v21.16b
fmls v21.4s, v0.4s, v9.4s[1]
#else
fmul v21.4s, v0.4s, v9.4s[1]
#endif
OP_ir v21.4s, v1.4s, v8.4s[1]
fmul v24.4s, v0.4s, v8.4s[2]
OP_ii v24.4s, v1.4s, v9.4s[2]
fmul v25.4s, v0.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v25.4s, v25.4s
eor v25.16b, v25.16b, v25.16b
fmls v25.4s, v0.4s, v9.4s[2]
#else
fmul v25.4s, v0.4s, v9.4s[2]
#endif
OP_ir v25.4s, v1.4s, v8.4s[2]
fmul v28.4s, v0.4s, v8.4s[3]
OP_ii v28.4s, v1.4s, v9.4s[3]
fmul v29.4s, v0.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v29.4s, v29.4s
eor v29.16b, v29.16b, v29.16b
fmls v29.4s, v0.4s, v9.4s[3]
#else
fmul v29.4s, v0.4s, v9.4s[3]
#endif
OP_ir v29.4s, v1.4s, v8.4s[3]

View File

@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define COND ge
#endif
#if !defined(DOUBLE)
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
ld1 {v0.s}[0], [X], INC_X
#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
#endif
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
@ -107,9 +119,8 @@ iamax_kernel_S1:
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
subs I, I, #1
bne iamax_kernel_S10
iamax_kernel_L999:

View File

@ -1,213 +0,0 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* General-purpose register aliases used by the macros and the entry routine. */
#define N x0 /* vector length; decremented by the entry routine as elements are consumed */
#define X x1 /* X vector address; post-incremented by every load */
#define INC_X x2 /* X stride in elements on entry; INIT_S rescales it to bytes */
#define INDEX x3 /* index of max/min value (1-based), copied to x0 on return */
#define Z x4 /* vector index: running 1-based element counter */
#define I x5 /* loop variable */
#define X_COPY x6 /* copy of X address, re-read by KERNEL_F4_FINALIZE */
#define MAXF_Z x7 /* 1-based index of the first element of the 4-wide block holding the current maximum */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define MAXF s5 /* running maximum of |x| (single precision) */
#define TMPF s6 /* |x| of the candidate element, or of the current 4-wide block */
#define TMPVF {v6.s}[0] /* lane 0 of v6, i.e. the same register as TMPF, as an ld1 lane operand */
#define SZ 4 /* post-increment, in bytes, applied after each scalar load */
/******************************************************************************/
/*
 * INIT_F1: seed the scalar unit-stride scan with the first element.
 * Leaves MAXF = |x[0]|, INDEX = 1, Z = 1 and X pointing at the next element.
 */
.macro INIT_F1
	ldr	MAXF, [X], #SZ			// load first element, advance X by SZ bytes
	fabs	MAXF, MAXF			// the running maximum is kept as an absolute value
	mov	INDEX, #1			// best index so far (1-based)
	mov	Z, INDEX			// Z = index of the element just consumed
.endm
/*
 * KERNEL_F1: examine one more contiguous element.
 * Z is bumped before it is recorded, so the index written to INDEX for this
 * element is (Z on entry) + 1.  MAXF/INDEX change only when the new |x| is
 * strictly greater than MAXF, so the earliest maximum wins on ties.
 */
.macro KERNEL_F1
	ldr	TMPF, [X], #SZ			// next element, advance X by SZ bytes
	fabs	TMPF, TMPF
	add	Z, Z, #1			// Z = index of the element just loaded
	fcmp	TMPF, MAXF
	csel	INDEX, Z, INDEX, gt		// take the new index only if |x| > MAXF
	fcsel	MAXF, TMPF, MAXF, gt		// ... and the new maximum with it
.endm
/*
 * INIT_F4: seed the 4-wide unit-stride scan with elements 1..4.
 * Leaves MAXF = max |x[0..3]|, MAXF_Z = 1 (first index of the winning block)
 * and Z = 5 (first index of the next block); X is advanced by 16 bytes.
 */
.macro INIT_F4
	mov	MAXF_Z, #1			// the only block seen so far starts at index 1
	mov	Z, #5				// the next block starts at index 5
	ld1	{v0.4s}, [X], #16		// load four floats, advance X
	fabs	v0.4s, v0.4s
	fmaxv	MAXF, v0.4s			// horizontal max across the four lanes
.endm
/*
 * KERNEL_F4: examine the next four contiguous elements as one block.
 * On entry Z is the 1-based index of the first element of the block.  When
 * the block maximum is strictly greater than MAXF, MAXF is replaced and
 * MAXF_Z records Z; the exact lane is resolved later by KERNEL_F4_FINALIZE.
 * Z is then advanced by 4 to the start of the following block.
 */
.macro KERNEL_F4
	ld1	{v0.4s}, [X], #16		// load four floats, advance X
	PRFM	PLDL1KEEP, [X, #512]		// prefetch 512 bytes past the new X
	fabs	v0.4s, v0.4s
	fmaxv	TMPF, v0.4s			// TMPF = max |x| within this block
	fcmp	TMPF, MAXF
	csel	MAXF_Z, Z, MAXF_Z, gt		// remember the block start if it wins
	fcsel	MAXF, TMPF, MAXF, gt
	add	Z, Z, #4			// Z = first index of the next block
.endm
/*
 * KERNEL_F4_FINALIZE: turn the winning 4-element block into an exact index.
 *
 * In:   MAXF    maximum |x| found by INIT_F4 / KERNEL_F4
 *       MAXF_Z  1-based index of the first element of the winning block
 *       X_COPY  base address of X (must still be the unmodified copy)
 *       Z       1-based index of the next unread element
 * Out:  INDEX   1-based index of the first element whose |x| equals MAXF
 *       Z       index of the last element already read (see below)
 * Clobbers: MAXF_Z, X_COPY, TMPF, flags.
 *
 * The first three lanes of the block are compared explicitly; if none of
 * them matches, the fourth is taken by elimination.
 *
 * Z is re-based on exit because the two scan styles use different
 * conventions: the F4 macros keep Z as the index of the NEXT element, while
 * KERNEL_F1 increments Z before recording it.  Without the adjustment every
 * tail element scanned after a vector pass is reported one position too
 * high (N = 5 with the maximum in the last element returned 6).
 *
 * The macro defines the global label KERNEL_F4_FINALIZE_DONE, so it can be
 * expanded only once per translation unit.
 */
.macro KERNEL_F4_FINALIZE
	mov	INDEX, MAXF_Z			// start at the first lane of the block
	sub	MAXF_Z, MAXF_Z, #1		// 0-based element offset ...
	lsl	MAXF_Z, MAXF_Z, #2		// ... scaled to bytes (4 per float)
	add	X_COPY, X_COPY, MAXF_Z		// X_COPY -> first element of the block

	ldr	TMPF, [X_COPY], #SZ		// lane 0
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1
	ldr	TMPF, [X_COPY], #SZ		// lane 1
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1
	ldr	TMPF, [X_COPY], #SZ		// lane 2
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE

	add	INDEX, INDEX, #1		// lane 3 by elimination
KERNEL_F4_FINALIZE_DONE:
	sub	Z, Z, #1			// hand over to KERNEL_F1: Z = last index read
.endm
/*
 * INIT_S: seed the strided scan with the first element.
 * Converts INC_X from an element count to a byte stride (x4), then leaves
 * MAXF = |x[0]|, INDEX = 1, Z = 1 and X advanced by one stride.
 */
.macro INIT_S
	lsl	INC_X, INC_X, #2		// stride in bytes; must precede the load
	mov	INDEX, #1			// best index so far (1-based)
	ld1	TMPVF, [X], INC_X		// first element into lane 0 of v6, step X
	mov	Z, INDEX			// Z = index of the element just consumed
	fabs	MAXF, TMPF			// running maximum is kept as |x|
.endm
/*
 * KERNEL_S1: examine one more element at the byte stride held in INC_X.
 * Z is bumped before it is recorded; MAXF/INDEX change only when the new
 * |x| is strictly greater than MAXF, so the earliest maximum wins on ties.
 */
.macro KERNEL_S1
	ld1	TMPVF, [X], INC_X		// next element into lane 0 of v6, step X
	fabs	TMPF, TMPF
	add	Z, Z, #1			// Z = index of the element just loaded
	fcmp	TMPF, MAXF
	csel	INDEX, Z, INDEX, gt		// take the new index only if |x| > MAXF
	fcsel	MAXF, TMPF, MAXF, gt		// ... and the new maximum with it
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
/*
 * Entry routine: returns in x0 the 1-based index of the element of X with
 * the largest absolute value, or 0 when N <= 0 or INC_X <= 0.
 *
 * In:   x0 = N, x1 = X, x2 = INC_X (in elements)
 * Out:  x0 = INDEX
 * Uses: x3-x7, v0, v5, v6 and the flags; no stack is touched here.
 *
 * Unit stride takes the 4-wide vector path (INIT_F4 / KERNEL_F4) followed by
 * a scalar tail; any other positive stride takes the scalar strided path.
 */
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero // N <= 0: nothing to scan, return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0: return 0
PRFM PLDL1KEEP, [X]
mov X_COPY, X // keep the base of X for KERNEL_F4_FINALIZE
cmp INC_X, #1
bne iamax_kernel_S_BEGIN // non-unit stride: strided scalar path
iamax_kernel_F_BEGIN:
asr I, N, #2 // I = number of complete 4-element blocks
cmp I, xzr
beq iamax_kernel_F1_INIT // fewer than 4 elements: scalar only
INIT_F4 // consumes the first block
subs I, I, #1
beq iamax_kernel_F4_FINALIZE // that was the only block
iamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne iamax_kernel_F4
iamax_kernel_F4_FINALIZE:
KERNEL_F4_FINALIZE // resolve the winning block to an element index
iamax_kernel_F1:
ands I, N, #3 // I = elements left over after the 4-wide blocks
ble iamax_kernel_L999 // ands clears V and the result is never negative, so this is taken only when I == 0
iamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_F1_INIT:
INIT_F1 // consumes the first element
subs N, N, #1 // N = elements still unread (0..2 here), so the tail code above sees the right count
b iamax_kernel_F1
iamax_kernel_S_BEGIN:
INIT_S // scales INC_X to bytes and consumes the first element
subs N, N, #1 // N = elements still unread
ble iamax_kernel_L999 // single-element vector: INDEX is already 1
asr I, N, #2 // I = number of 4x-unrolled iterations
cmp I, xzr
ble iamax_kernel_S1 // fewer than 4 left: straight to the tail
iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
iamax_kernel_S1:
ands I, N, #3 // I = remaining elements after the unrolled loop
ble iamax_kernel_L999 // taken only when I == 0 (see the note at iamax_kernel_F1)
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
iamax_kernel_L999:
mov x0, INDEX // return the 1-based index
ret
iamax_kernel_zero:
mov x0, xzr // invalid N or INC_X: return 0
ret
EPILOGUE

View File

@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
#if !defined(DOUBLE)
fneg s2, S
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, S
ins v1.s[1], v2.s[0] // [-S, S]
#else
fneg d2, S
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, S
ins v1.d[1], v2.d[0] // [-S, S]
#endif
.endm

View File

@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define DA_R s0 /* scale input value */
#define DA_I s1 /* scale input value */
#define TMPX v2.2s
#define TMPY v3.2s
#define SZ 4
#else
#define DA_R d0 /* scale input value */
#define DA_I d1 /* scale input value */
#define TMPX v2.2d
#define TMPY v3.2d
#define SZ 8
#endif
@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
#else
#if !defined(DOUBLE)
fneg s2, DA_R
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, DA_R
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
#else
fneg d2, DA_R
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, DA_R
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
#endif
@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
ins v16.s[0], v0.s[0]
ins v16.s[1], v16.s[0]
ins v16.d[1], v16.d[0]
#if !defined(CONJ)
ins v17.s[0], v1.s[1]
#else
ins v17.s[0], v1.s[0]
#endif
ins v17.s[1], v17.s[0]
ins v17.d[1], v17.d[0]
#else //DOUBLE
ins v16.d[0], v0.d[0]
ins v16.d[1], v16.d[0]
#if !defined(CONJ)
ins v17.d[0], v1.d[1]
#else
ins v17.d[0], v1.d[0]
#endif
ins v17.d[1], v17.d[0]
#endif
.endm
@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
ld2 {v2.4s, v3.4s}, [X], #32
ld2 {v4.4s, v5.4s}, [Y_COPY], #32
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
// V5 = Y[7], Y[6], Y[5], Y[4]
fmla v4.4s, v2.4s, v16.4s
#if !defined(CONJ)
fmls v4.4s, v3.4s, v17.4s
#else
fmla v4.4s, v3.4s, v17.4s
#endif
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
#if !defined(CONJ)
fmla v5.4s, v2.4s, v17.4s
#else
fmls v5.4s, v2.4s, v17.4s
#endif
fmla v5.4s, v3.4s, v16.4s
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v4.4s}, [Y], #16
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += +-DA_R * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v5.4s}, [Y], #16
st2 {v4.4s, v5.4s}, [Y], #32
#else // DOUBLE
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
ld2 {v2.2d, v3.2d}, [X], #32
ld2 {v4.2d, v5.2d}, [Y_COPY], #32
ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
fmla v4.2d, v2.2d, v16.2d
#if !defined(CONJ)
fmls v4.2d, v3.2d, v17.2d
#else
fmla v4.2d, v3.2d, v17.2d
#endif
#if !defined(CONJ)
fmla v5.2d, v2.2d, v17.2d
#else
fmls v5.2d, v2.2d, v17.2d
#endif
fmla v5.2d, v3.2d, v16.2d
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
st2 {v4.2d, v5.2d}, [Y], #32
fmla v16.2d, v0.2d, v2.2d
fmla v17.2d, v0.2d, v3.2d
ld2 {v18.2d, v19.2d}, [X], #32
ld2 {v20.2d, v21.2d}, [Y_COPY], #32
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
fmla v20.2d, v18.2d, v16.2d
#if !defined(CONJ)
fmls v20.2d, v19.2d, v17.2d
#else
fmla v20.2d, v19.2d, v17.2d
#endif
#if !defined(CONJ)
fmla v21.2d, v18.2d, v17.2d
#else
fmls v21.2d, v18.2d, v17.2d
#endif
fmla v21.2d, v19.2d, v16.2d
fmla v16.2d, v1.2d, v20.2d
fmla v17.2d, v1.2d, v21.2d
st1 {v16.2d,v17.2d}, [Y], #32
fmla v18.2d, v0.2d, v4.2d
fmla v19.2d, v0.2d, v5.2d
fmla v18.2d, v1.2d, v22.2d
fmla v19.2d, v1.2d, v23.2d
st1 {v18.2d,v19.2d}, [Y], #32
st2 {v20.2d, v21.2d}, [Y], #32
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]

View File

@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v16.2d, v0.2d, v8.2d[0]
OP_ii v16.2d, v1.2d, v9.2d[0]
fmul v17.2d, v0.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v17.2d, v17.2d
eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0]
#else
fmul v17.2d, v0.2d, v9.2d[0]
#endif
OP_ir v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0]
OP_ii v18.2d, v3.2d, v9.2d[0]
fmul v19.2d, v2.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v19.2d, v19.2d
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0]
#else
fmul v19.2d, v2.2d, v9.2d[0]
#endif
OP_ir v19.2d, v3.2d, v8.2d[0]
fmul v20.2d, v0.2d, v8.2d[1]
OP_ii v20.2d, v1.2d, v9.2d[1]
fmul v21.2d, v0.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v21.2d, v21.2d
eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1]
#else
fmul v21.2d, v0.2d, v9.2d[1]
#endif
OP_ir v21.2d, v1.2d, v8.2d[1]
fmul v22.2d, v2.2d, v8.2d[1]
OP_ii v22.2d, v3.2d, v9.2d[1]
fmul v23.2d, v2.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v23.2d, v23.2d
eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1]
#else
fmul v23.2d, v2.2d, v9.2d[1]
#endif
OP_ir v23.2d, v3.2d, v8.2d[1]
fmul v24.2d, v0.2d, v10.2d[0]
OP_ii v24.2d, v1.2d, v11.2d[0]
fmul v25.2d, v0.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v25.2d, v25.2d
eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0]
#else
fmul v25.2d, v0.2d, v11.2d[0]
#endif
OP_ir v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0]
OP_ii v26.2d, v3.2d, v11.2d[0]
fmul v27.2d, v2.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v27.2d, v27.2d
eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0]
#else
fmul v27.2d, v2.2d, v11.2d[0]
#endif
OP_ir v27.2d, v3.2d, v10.2d[0]
fmul v28.2d, v0.2d, v10.2d[1]
OP_ii v28.2d, v1.2d, v11.2d[1]
fmul v29.2d, v0.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v29.2d, v29.2d
eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1]
#else
fmul v29.2d, v0.2d, v11.2d[1]
#endif
OP_ir v29.2d, v1.2d, v10.2d[1]
fmul v30.2d, v2.2d, v10.2d[1]
OP_ii v30.2d, v3.2d, v11.2d[1]
fmul v31.2d, v2.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v31.2d, v31.2d
eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1]
#else
fmul v31.2d, v2.2d, v11.2d[1]
#endif
OP_ir v31.2d, v3.2d, v10.2d[1]

View File

@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
fneg s2, ALPHA_I
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, ALPHA_I
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
#endif
#else
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
fneg d2, ALPHA_I
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, ALPHA_I
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
fneg s4, s3
eor v4.16b, v4.16b, v4.16b
fsub s4, s4, s3
ins v3.s[1], v4.s[0]
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#else
fneg s4, s3
eor v4.16b, v4.16b, v4.16b
fsub s4, s4, s3
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
fneg s4, s2
eor v4.16b, v4.16b, v4.16b
fsub s4, s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#else
fneg s3, s3
eor v4.16b, v4.16b, v4.16b
fsub s3, s4, s3
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
fneg s4, s2
eor v4.16b, v4.16b, v4.16b
fsub s4, s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ
@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
fneg d4, d3 // -I(TEMP)
eor v4.16b, v4.16b, v4.16b
fsub d4, d4, d3
ins v3.d[1], v4.d[0]
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#else
fneg d4, d3 // -I(TEMP)
eor v4.16b, v4.16b, v4.16b
fsub d4, d4, d3
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
fneg d4, d2 // -R(TEMP)
eor v4.16b, v4.16b, v4.16b
fsub d4, d4, d2
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#else
fneg d3, d3 // -I(TEMP)
eor v4.16b, v4.16b, v4.16b
fsub d3, d4, d3
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
fneg d4, d2 // -R(TEMP)
eor v4.16b, v4.16b, v4.16b
fsub d4, d4, d2
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ

View File

@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(XCONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
fneg s2, ALPHA_I
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, ALPHA_I
ins v1.s[1], v2.s[0]
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
#else
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
fneg d2, ALPHA_I
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, ALPHA_I
ins v1.d[1], v2.d[0]
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
#endif
#else // XCONJ
#if !defined(DOUBLE)
fneg s2, ALPHA_R
eor v2.16b, v2.16b, v2.16b
fsub s2, s2, ALPHA_R
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
#else
fneg d2, ALPHA_R
eor v2.16b, v2.16b, v2.16b
fsub d2, d2, ALPHA_R
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
#endif
@ -136,89 +140,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#endif // CONJ
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#endif // CONJ
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#endif
#endif // CONJ
#endif //DOUBLE
.endm
@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
fneg s16, s5
eor v16.16b, v16.16b, v16.16b
fsub s16, s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
fneg d16, d5
eor v16.16b, v16.16b, v16.16b
fsub d16, d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
fneg s16, s5
eor v16.16b, v16.16b, v16.16b
fsub s16, s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
fneg d16, d5
eor v16.16b, v16.16b, v16.16b
fsub d16, d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */
#define X_COPY x6 /* Copy of X */
/*******************************************************************************
* Macro definitions
@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], #8
fmul s3, DA_R, v2.s[0] // DA_R*X0
fmul s5, DA_I, v2.s[1] // DA_I*X1
fsub s3, s3, s5 // DA_R*X0-DA_I*X1
fmul s4, DA_I, v2.s[0] // DA_I*X0
fmul s5, DA_R, v2.s[1] // DA_R*X1
fadd s4, s4, s5 // DA_I*X0+DA_R*X1
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v3.2s}, [X], #8
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], #16
#endif
fmul d3, DA_R, v2.d[0] // DA_R*X0
fmul d5, DA_I, v2.d[1] // DA_I*X1
fsub d3, d3, d5 // DA_R*X0-DA_I*X1
fmul d4, DA_I, v2.d[0] // DA_I*X0
fmul d5, DA_R, v2.d[1] // DA_R*X1
fadd d4, d4, d5 // DA_I*X0+DA_R*X1
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v3.2d}, [X], #16
#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
ins v16.s[0], v0.s[0]
ins v16.s[1], v16.s[0]
ins v16.d[1], v16.d[0]
ins v17.s[0], v1.s[0]
ins v17.s[1], v17.s[0]
ins v17.d[1], v17.d[0]
#else //DOUBLE
ins v16.d[0], v0.d[0]
ins v16.d[1], v16.d[0]
ins v17.d[0], v1.d[0]
ins v17.d[1], v17.d[0]
#endif
.endm
@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL_F4
	/*
	 * Scale four consecutive complex elements by the complex scalar.
	 *
	 * In:    X       read pointer, post-incremented past the elements consumed
	 *        X_COPY  write pointer, post-incremented past the elements produced
	 *        v16     real part of the scalar in every lane (see KERNEL_INIT_F4)
	 *        v17     imaginary part of the scalar in every lane
	 * Clobb: v2-v6 (plus v18-v21 when DOUBLE is defined)
	 *
	 * ld2 de-interleaves the (re, im) pairs so that one register holds the
	 * real parts and the other the imaginary parts; each input is loaded
	 * exactly once and never overwritten before both products have been
	 * formed.  st2 re-interleaves the results on the way out.
	 */
#if !defined(DOUBLE)
	ld2	{v2.4s, v3.4s}, [X], #32	// v2 = re[0..3], v3 = im[0..3]

	fmul	v4.4s, v2.4s, v16.4s		// re * A_R
	fmul	v6.4s, v3.4s, v17.4s		// im * A_I
	fsub	v4.4s, v4.4s, v6.4s		// re' = re*A_R - im*A_I

	fmul	v5.4s, v2.4s, v17.4s		// re * A_I
	fmul	v6.4s, v3.4s, v16.4s		// im * A_R
	fadd	v5.4s, v5.4s, v6.4s		// im' = re*A_I + im*A_R

	st2	{v4.4s, v5.4s}, [X_COPY], #32
#else // DOUBLE
	// A 128-bit register holds only two doubles, so the four elements are
	// processed as two independent load/compute/store groups.
	ld2	{v2.2d, v3.2d}, [X], #32	// v2 = re[0..1], v3 = im[0..1]

	fmul	v4.2d, v2.2d, v16.2d		// re * A_R
	fmul	v6.2d, v3.2d, v17.2d		// im * A_I
	fsub	v4.2d, v4.2d, v6.2d		// re' = re*A_R - im*A_I

	fmul	v5.2d, v2.2d, v17.2d		// re * A_I
	fmul	v6.2d, v3.2d, v16.2d		// im * A_R
	fadd	v5.2d, v5.2d, v6.2d		// im' = re*A_I + im*A_R

	st2	{v4.2d, v5.2d}, [X_COPY], #32

	ld2	{v18.2d, v19.2d}, [X], #32	// v18 = re[2..3], v19 = im[2..3]

	fmul	v20.2d, v18.2d, v16.2d		// re * A_R
	fmul	v6.2d, v19.2d, v17.2d		// im * A_I
	fsub	v20.2d, v20.2d, v6.2d		// re' = re*A_R - im*A_I

	fmul	v21.2d, v18.2d, v17.2d		// re * A_I
	fmul	v6.2d, v19.2d, v16.2d		// im * A_R
	fadd	v21.2d, v21.2d, v6.2d		// im' = re*A_I + im*A_R

	st2	{v20.2d, v21.2d}, [X_COPY], #32
#endif
	PRFM	PLDL1KEEP, [X, #1024]		// prefetch 1 KiB ahead of the read pointer
.endm
@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL_S1
	/*
	 * Scale one complex element in place and step X by INC_X.
	 *
	 * In:    X      address of the element (re at lane 0, im at lane 1)
	 *        DA_R   real part of the scalar
	 *        DA_I   imaginary part of the scalar
	 *        INC_X  register stride added to X by the post-indexed store
	 * Clobb: v2-v5
	 *
	 * The element is loaded once, both result parts are computed from that
	 * untouched copy with separate multiply and add/subtract steps, and the
	 * result is stored exactly once, so X advances by a single stride.
	 */
#if !defined(DOUBLE)
	ld1	{v2.2s}, [X]			// v2 = { X0 (re), X1 (im) }

	fmul	s3, DA_R, v2.s[0]		// DA_R*X0
	fmul	s5, DA_I, v2.s[1]		// DA_I*X1
	fsub	s3, s3, s5			// re' = DA_R*X0 - DA_I*X1

	fmul	s4, DA_I, v2.s[0]		// DA_I*X0
	fmul	s5, DA_R, v2.s[1]		// DA_R*X1
	fadd	s4, s4, s5			// im' = DA_I*X0 + DA_R*X1

	ins	v3.s[1], v4.s[0]		// v3 = { re', im' }
	st1	{v3.2s}, [X], INC_X
#else
	ld1	{v2.2d}, [X]			// v2 = { X0 (re), X1 (im) }

	fmul	d3, DA_R, v2.d[0]		// DA_R*X0
	fmul	d5, DA_I, v2.d[1]		// DA_I*X1
	fsub	d3, d3, d5			// re' = DA_R*X0 - DA_I*X1

	fmul	d4, DA_I, v2.d[0]		// DA_I*X0
	fmul	d5, DA_R, v2.d[1]		// DA_R*X1
	fadd	d4, d4, d5			// im' = DA_I*X0 + DA_R*X1

	ins	v3.d[1], v4.d[0]		// v3 = { re', im' }
	st1	{v3.2d}, [X], INC_X
#endif
.endm
/*******************************************************************************
@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/*
 * Entry point and dispatch on the scalar (DA_R + i*DA_I):
 *
 *   N <= 0                    -> zscal_kernel_L999
 *   DA_R == 0 && DA_I == 0    -> zscal_kernel_RI_zero
 *   DA_R == 0 && DA_I != 0    -> zscal_kernel_R_zero
 *   DA_R != 0 && DA_I == 0    -> zscal_kernel_I_zero
 *   DA_R != 0 && DA_I != 0    -> falls through to zscal_kernel_RI_non_zero
 *
 * Each condition is tested once and branches only to the label of its own
 * case; in particular the all-zero scalar must reach zscal_kernel_RI_zero.
 */
	PROLOGUE

	// NOTE(review): the four literal words below and the s20-s26 arithmetic
	// that follows are not read by any code visible in this section.  They
	// compute the same product once with a fused multiply-add (s24) and once
	// with separate multiply/add (s25), which looks like a leftover rounding
	// experiment - confirm nothing else uses them, then delete.
	b	zscal_begin
data_ar:
	.word 0x3e44fae6
data_ai:
	.word 0x3d320fa2
data_xr:
	.word 0x3f4baff1
data_xi:
	.word 0xbe8ef0bd

zscal_begin:
	ldr	s20, data_ar
	ldr	s21, data_ai
	ldr	s22, data_xr
	ldr	s23, data_xi

	fmul	s24, s22, s21
	fmla	s24, s23, v20.s[0]

	fmul	s25, s22, s21
	fmul	s26, s23, s20
	fadd	s25, s25, s26

	mov	X_COPY, X			// separate write pointer used by KERNEL_F4

	cmp	N, xzr
	ble	zscal_kernel_L999		// nothing to do for N <= 0

	fcmp	DA_R, #0.0
	bne	zscal_kernel_R_non_zero

	// DA_R == 0 from here on
	fcmp	DA_I, #0.0
	beq	zscal_kernel_RI_zero

	b	zscal_kernel_R_zero

zscal_kernel_1:					// old name for the label below; kept in case
						// code outside this view still branches to it
zscal_kernel_R_non_zero:

	fcmp	DA_I, #0.0
	beq	zscal_kernel_I_zero

/*******************************************************************************
* A_R != 0 && A_I != 0
*******************************************************************************/

zscal_kernel_RI_non_zero:

	INIT
@ -257,16 +306,85 @@ zscal_kernel_L999:
mov w0, wzr
ret
zscal_kernel_zero:
/*******************************************************************************
* A_R == 0 && A_I != 0
*
* With a purely imaginary scalar each element (re, im) becomes
* (-A_I*im, A_I*re): multiply lane-wise by a (+A_I, -A_I) vector, then swap
* the two lanes.
*******************************************************************************/

zscal_kernel_R_zero:
	INIT_S

	// Build the multiplier in v1: lane 1 receives (0 - DA_I); lane 0 is left
	// as it was (presumably DA_I already lives there - confirm against the
	// DA_I register definition).
	eor	v2.16b, v2.16b, v2.16b		// v2 = 0
#if defined(DOUBLE)
	fsub	d2, d2, DA_I			// d2 = -DA_I
	ins	v1.d[1], v2.d[0]		// v1 = -DA_I, DA_I
#else
	fsub	s2, s2, DA_I			// s2 = -DA_I
	ins	v1.s[1], v2.s[0]		// v1 = -DA_I, DA_I
#endif

zscal_kernel_R_zero_1:
#if defined(DOUBLE)
	ld1	{v2.2d}, [X]			// X1, X0
	fmul	v2.2d, v2.2d, v1.2d		// -DA_I*X1, DA_I*X0
	ext	v2.16b, v2.16b, v2.16b, #8	// DA_I*X0, -DA_I*X1
	st1	{v2.2d}, [X], INC_X		// store, then X += INC_X
#else
	ld1	{v2.2s}, [X]			// X1, X0
	fmul	v2.2s, v2.2s, v1.2s		// -DA_I*X1, DA_I*X0
	ext	v2.8b, v2.8b, v2.8b, #4		// DA_I*X0, -DA_I*X1
	st1	{v2.2s}, [X], INC_X		// store, then X += INC_X
#endif
	subs	N, N, #1
	bne	zscal_kernel_R_zero_1

	mov	w0, wzr				// return 0
	ret
/*******************************************************************************
* A_R != 0 && A_I == 0
*
* With a purely real scalar both parts of every element are simply multiplied
* by A_R.
*******************************************************************************/

zscal_kernel_I_zero:
	INIT_S

	// Duplicate lane 0 of v0 into lane 1 so one vector multiply scales both
	// the real and the imaginary part.
#if defined(DOUBLE)
	ins	v0.d[1], v0.d[0]		// v0 = DA_R, DA_R
#else
	ins	v0.s[1], v0.s[0]		// v0 = DA_R, DA_R
#endif

zscal_kernel_I_zero_1:
#if defined(DOUBLE)
	ld1	{v2.2d}, [X]			// X1, X0
	fmul	v2.2d, v2.2d, v0.2d		// DA_R*X1, DA_R*X0
	st1	{v2.2d}, [X], INC_X		// store, then X += INC_X
#else
	ld1	{v2.2s}, [X]			// X1, X0
	fmul	v2.2s, v2.2s, v0.2s		// DA_R*X1, DA_R*X0
	st1	{v2.2s}, [X], INC_X		// store, then X += INC_X
#endif
	subs	N, N, #1
	bne	zscal_kernel_I_zero_1

	mov	w0, wzr				// return 0
	ret
/*******************************************************************************
* A_R == 0 && A_I == 0
*
* Every element is overwritten with the pair (DA_R, DA_I), both of which are
* zero on this path.
*******************************************************************************/

zscal_kernel_RI_zero:

	INIT_S

zscal_kernel_Z1:				// old name for the loop label below; kept as
						// an alias until confirmed unreferenced
zscal_kernel_RI_zero_1:

	stp	DA_R, DA_I, [X]			// write (DA_R, DA_I) over the element
	add	X, X, INC_X
	// N must be decremented exactly once per element stored.  A second
	// decrement-and-branch here would turn N == 0 into N == -1 and send the
	// loop on past the end of the vector.
	subs	N, N, #1
	bne	zscal_kernel_RI_zero_1

	mov	w0, wzr				// return 0
	ret

View File

@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmul v16.2d, v0.2d, v8.2d[0]
OP_ii v16.2d, v1.2d, v9.2d[0]
fmul v17.2d, v0.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v17.2d, v17.2d
eor v17.16b, v17.16b, v17.16b
fmls v17.2d, v0.2d, v9.2d[0]
#else
fmul v17.2d, v0.2d, v9.2d[0]
#endif
OP_ir v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0]
OP_ii v18.2d, v3.2d, v9.2d[0]
fmul v19.2d, v2.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v19.2d, v19.2d
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.2d[0]
#else
fmul v19.2d, v2.2d, v9.2d[0]
#endif
OP_ir v19.2d, v3.2d, v8.2d[0]
fmul v20.2d, v0.2d, v8.2d[1]
OP_ii v20.2d, v1.2d, v9.2d[1]
fmul v21.2d, v0.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v21.2d, v21.2d
eor v21.16b, v21.16b, v21.16b
fmls v21.2d, v0.2d, v9.2d[1]
#else
fmul v21.2d, v0.2d, v9.2d[1]
#endif
OP_ir v21.2d, v1.2d, v8.2d[1]
fmul v22.2d, v2.2d, v8.2d[1]
OP_ii v22.2d, v3.2d, v9.2d[1]
fmul v23.2d, v2.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v23.2d, v23.2d
eor v23.16b, v23.16b, v23.16b
fmls v23.2d, v2.2d, v9.2d[1]
#else
fmul v23.2d, v2.2d, v9.2d[1]
#endif
OP_ir v23.2d, v3.2d, v8.2d[1]
fmul v24.2d, v0.2d, v10.2d[0]
OP_ii v24.2d, v1.2d, v11.2d[0]
fmul v25.2d, v0.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v25.2d, v25.2d
eor v25.16b, v25.16b, v25.16b
fmls v25.2d, v0.2d, v11.2d[0]
#else
fmul v25.2d, v0.2d, v11.2d[0]
#endif
OP_ir v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0]
OP_ii v26.2d, v3.2d, v11.2d[0]
fmul v27.2d, v2.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v27.2d, v27.2d
eor v27.16b, v27.16b, v27.16b
fmls v27.2d, v2.2d, v11.2d[0]
#else
fmul v27.2d, v2.2d, v11.2d[0]
#endif
OP_ir v27.2d, v3.2d, v10.2d[0]
fmul v28.2d, v0.2d, v10.2d[1]
OP_ii v28.2d, v1.2d, v11.2d[1]
fmul v29.2d, v0.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v29.2d, v29.2d
eor v29.16b, v29.16b, v29.16b
fmls v29.2d, v0.2d, v11.2d[1]
#else
fmul v29.2d, v0.2d, v11.2d[1]
#endif
OP_ir v29.2d, v1.2d, v10.2d[1]
fmul v30.2d, v2.2d, v10.2d[1]
OP_ii v30.2d, v3.2d, v11.2d[1]
fmul v31.2d, v2.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
fneg v31.2d, v31.2d
eor v31.16b, v31.16b, v31.16b
fmls v31.2d, v2.2d, v11.2d[1]
#else
fmul v31.2d, v2.2d, v11.2d[1]
#endif
OP_ir v31.2d, v3.2d, v10.2d[1]