Improvements to COPY and IAMAX kernels

This commit is contained in:
Ashwin Sekhar T K 2016-07-14 13:49:15 +05:30
parent 8d86d14d3f
commit 78782485b6
3 changed files with 424 additions and 23 deletions

View File

@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
ldr d0, [X], #8
str d0, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
ldr q0, [X], #16
str q0, [Y], #16
ldr q1, [X], #16
str q1, [Y], #16
ldr q2, [X], #16
str q2, [Y], #16
ldr q3, [X], #16
str q3, [Y], #16
#endif
#endif

View File

@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fabs MAXF, MAXF
.endm
// KERNEL_F8: process one block of eight consecutive elements at X.
// X is post-incremented past the block. The absolute values of the eight
// elements are reduced to a single maximum in TMPF, which is then compared
// with the running value in MAXF. When COND holds after the compare, MAXF
// and INDEX are kept; otherwise MAXF takes TMPF and INDEX takes Z.
// Z is advanced by 8 at the end.
// Only the counter Z for the whole block is recorded here, not the lane
// that produced the block maximum.
// NOTE(review): the reduction always uses fmax/fmaxv/fmaxp while the select
// uses COND, which is defined outside this view. If COND can ever select a
// minimum search, this block would be inconsistent with it - confirm.
// Writes: X, Z, INDEX, MAXF, TMPF, v2-v3 (v2-v5 for DOUBLE), flags.
.macro KERNEL_F8
#if !defined(DOUBLE)
// Single precision: 2 x 128-bit loads = 8 floats (32 bytes).
ldp q2, q3, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
// Lane-wise max of the two vectors, then max across the four lanes.
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
// Flags from MAXF vs TMPF drive both conditional selects below.
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
// Must follow the csel, which reads the pre-increment Z.
add Z, Z, #8
#else
// Double precision: 4 x 128-bit loads = 8 doubles (64 bytes).
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
// Tree reduction: four vectors -> two -> one, then max of the last pair.
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
// Flags from MAXF vs TMPF drive both conditional selects below.
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
// Must follow the csel, which reads the pre-increment Z.
add Z, Z, #8
#endif
// Prefetch hint for the data 1024 bytes ahead of the updated X.
PRFM PLDL1KEEP, [X, #1024]
.endm
// KERNEL_F8_FINALIZE: refine INDEX from a block position to an element
// position. (INDEX - 1) is used as an element offset from the address in
// x7; the eight elements starting there are reloaded, their absolute values
// are compared with MAXF one at a time from offset +7 down to offset +0,
// and every equal compare overwrites INDEX with (entry INDEX + offset).
// Because the walk is descending, the lowest matching offset is the one
// that remains. If no element compares equal, INDEX is left unchanged.
// x7 is expected to hold the address of the first element; the driver code
// in this file copies X into x7 before the block loop.
// Writes: x6, x7, INDEX, v2-v3, lane 0 of v4-v7, flags.
.macro KERNEL_F8_FINALIZE
// x6 = element offset of the selected block relative to x7.
sub x6, INDEX, #1
#if !defined(DOUBLE)
// Scale by 4 bytes per float and load all eight candidates at once.
lsl x6, x6, #2
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
// Move offsets +4..+7 (lanes of v3) into s4..s7 so fcmp can read them.
ins v4.s[0], v3.s[0]
ins v5.s[0], v3.s[1]
ins v6.s[0], v3.s[2]
ins v7.s[0], v3.s[3]
// x6 becomes the candidate index; it starts at entry INDEX + 7 and is
// decremented before each following compare.
add x6, INDEX, #7
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
// Same again for offsets +0..+3 (lanes of v2), reusing s4..s7.
ins v4.s[0], v2.s[0]
ins v5.s[0], v2.s[1]
ins v6.s[0], v2.s[2]
ins v7.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
// Doubles need four q registers for eight elements, so the block is handled
// in two halves. Start with the upper half: skip 4 elements, scale by 8.
add x6, x6, #4
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
// d4..d7 = offsets +4..+7.
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
// x6 becomes the candidate index; it starts at entry INDEX + 7 and is
// decremented before each following compare.
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
// Step back 32 bytes (4 doubles) to the lower half: offsets +0..+3.
sub x7, x7, #32
ldp q2, q3, [x7]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
// d4..d7 = offsets +0..+3.
ins v4.d[0], v2.d[0]
ins v5.d[0], v2.d[1]
ins v6.d[0], v3.d[0]
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d6
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d5
csel INDEX, x6, INDEX, eq
sub x6, x6, #1
fcmp MAXF, d4
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1

View File

@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
.endm
// KERNEL_F8: process one block of eight two-component elements at X.
// X is post-incremented past the block. After fabs, each faddp adds
// adjacent lane pairs, giving one sum per pair; the eight sums are reduced
// to a single maximum in TMPF and compared with the running value in MAXF.
// When COND holds after the compare, MAXF and INDEX are kept; otherwise
// MAXF takes TMPF and INDEX takes Z. Z is advanced by 8 at the end.
// NOTE(review): presumably each pair is an interleaved (real, imag) value,
// so each sum is |re| + |im| - confirm against the caller.
// NOTE(review): the reduction always uses fmax/fmaxv/fmaxp while the select
// uses COND, which is defined outside this view - confirm they agree.
// Writes: X, Z, INDEX, MAXF, TMPF, v2-v5 (plus v16-v19 for DOUBLE), flags.
.macro KERNEL_F8
#if !defined(DOUBLE)
// Single precision: 4 x 128-bit loads = 16 floats = 8 pairs (64 bytes).
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
// v2 = pair sums 0..3, v3 = pair sums 4..7.
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
// Lane-wise max of the two vectors, then max across the four lanes.
fmax v2.4s, v2.4s, v3.4s
fmaxv TMPF, v2.4s
// Flags from MAXF vs TMPF drive both conditional selects below.
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
// Must follow the csel, which reads the pre-increment Z.
add Z, Z, #8
#else
// Double precision: 8 x 128-bit loads = 16 doubles = 8 pairs (128 bytes).
ldp q2, q3, [X], #32
ldp q4, q5, [X], #32
ldp q16, q17, [X], #32
ldp q18, q19, [X], #32
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
// v2 = pair sums 0..1, v3 = 2..3, v4 = 4..5, v5 = 6..7.
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
// Tree reduction: four vectors -> two -> one, then max of the last pair.
fmax v2.2d, v2.2d, v3.2d
fmax v4.2d, v4.2d, v5.2d
fmax v2.2d, v2.2d, v4.2d
fmaxp TMPF, v2.2d
// Flags from MAXF vs TMPF drive both conditional selects below.
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
// Must follow the csel, which reads the pre-increment Z.
add Z, Z, #8
#endif
// Prefetch hint for the data 1024 bytes ahead of the updated X.
PRFM PLDL1KEEP, [X, #1024]
.endm
// KERNEL_F8_FINALIZE: refine INDEX from a block position to an element
// position. (INDEX - 1) is used as an element offset from the address in
// x7; the eight two-component elements starting there are reloaded, their
// pair sums of absolute values are recomputed exactly as in KERNEL_F8, and
// each sum is compared with MAXF from offset +7 down to offset +0. Every
// equal compare overwrites INDEX with (entry INDEX + offset); because the
// walk is descending, the lowest matching offset is the one that remains.
// If no sum compares equal, INDEX is left unchanged.
// x7 is expected to hold the address of the first element; the driver code
// in this file copies X into x7 before the block loop.
// Writes: x6, x7, INDEX, v2-v5 (plus v7 lane 0 and v16-v19 for DOUBLE), flags.
.macro KERNEL_F8_FINALIZE
// x6 = element offset of the selected block relative to x7.
sub x6, INDEX, #1
#if !defined(DOUBLE)
// Scale by 8 bytes per element (two floats) and load all eight elements.
lsl x6, x6, #3
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
fabs v5.4s, v5.4s
// v2 = pair sums for offsets +0..+3, v3 = pair sums for offsets +4..+7.
faddp v2.4s, v2.4s, v3.4s
faddp v3.4s, v4.4s, v5.4s
// v4 has been consumed by the faddp above, so its lane 0 (s4) is reused as
// the scalar scratch for each compare. x6 is the candidate index; it
// starts at entry INDEX + 7 and is decremented before each later compare.
ins v4.s[0], v3.s[3]
add x6, INDEX, #7
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v3.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
// Offsets +3..+0 come from v2.
ins v4.s[0], v2.s[3]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[2]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[1]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
ins v4.s[0], v2.s[0]
sub x6, x6, #1
fcmp MAXF, s4
csel INDEX, x6, INDEX, eq
#else
// Scale by 16 bytes per element (two doubles) and load all eight elements.
lsl x6, x6, #4
add x7, x7, x6
ldp q2, q3, [x7]
ldp q4, q5, [x7, #32]
ldp q16, q17, [x7, #64]
ldp q18, q19, [x7, #96]
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
fabs v16.2d, v16.2d
fabs v17.2d, v17.2d
fabs v18.2d, v18.2d
fabs v19.2d, v19.2d
// v2 = pair sums for offsets +0..+1, v3 = +2..+3, v4 = +4..+5, v5 = +6..+7.
faddp v2.2d, v2.2d, v3.2d
faddp v3.2d, v4.2d, v5.2d
faddp v4.2d, v16.2d, v17.2d
faddp v5.2d, v18.2d, v19.2d
// d7 is the scalar scratch for each compare. x6 is the candidate index; it
// starts at entry INDEX + 7 and is decremented before each later compare.
ins v7.d[0], v5.d[1]
add x6, INDEX, #7
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v5.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v4.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v3.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[1]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
ins v7.d[0], v2.d[0]
sub x6, x6, #1
fcmp MAXF, d7
csel INDEX, x6, INDEX, eq
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp INC_X, xzr
ble iamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1