From 78782485b6f859d72be854ba6c2a0ec52d137adb Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 14 Jul 2016 13:49:15 +0530
Subject: [PATCH] Improvements to COPY and IAMAX kernels

---
 kernel/arm64/copy.S   |  46 ++++-----
 kernel/arm64/iamax.S  | 184 +++++++++++++++++++++++++++++++++++
 kernel/arm64/izamax.S | 217 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 424 insertions(+), 23 deletions(-)

diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S
index 17aa5a1e8..70eab96fb 100644
--- a/kernel/arm64/copy.S
+++ b/kernel/arm64/copy.S
@@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	str	TMPF, [Y], #SZ
 #else
 #if !defined(DOUBLE)
-	ld1	{v0.2s}, [X], #8
-	st1	{v0.2s}, [Y], #8
+	ldr	d0, [X], #8	// 8-byte load, X post-incremented by 8
+	str	d0, [Y], #8	// 8-byte store, Y post-incremented by 8
 #else
-	ld1	{v0.2d}, [X], #16
-	st1	{v0.2d}, [Y], #16
+	ldr	q0, [X], #16	// 16-byte load, X post-incremented by 16
+	str	q0, [Y], #16	// 16-byte store, Y post-incremented by 16
 #endif
 #endif
 
 .endm
 
 .macro KERNEL_F4
-
 #if !defined(COMPLEX)
 #if !defined(DOUBLE)
-	ld1	{v0.4s}, [X], #16
-	st1	{v0.4s}, [Y], #16
+	ldr	q0, [X], #16	// real single: one 16-byte copy per call
+	str	q0, [Y], #16
 #else // DOUBLE
-	ld1	{v0.4s}, [X], #16
-	ld1	{v1.4s}, [X], #16
-	st1	{v0.4s}, [Y], #16
-	st1	{v1.4s}, [Y], #16
+	ldr	q0, [X], #16	// real double: two 16-byte copies (32 bytes) per call
+	str	q0, [Y], #16
+	ldr	q1, [X], #16
+	str	q1, [Y], #16
+
 #endif
 #else // COMPLEX
 #if !defined(DOUBLE)
-	ld1	{v0.4s}, [X], #16
-	ld1	{v1.4s}, [X], #16
-	st1	{v0.4s}, [Y], #16
-	st1	{v1.4s}, [Y], #16
+	ldr	q0, [X], #16	// complex single: two 16-byte copies (32 bytes) per call
+	str	q0, [Y], #16
+	ldr	q1, [X], #16
+	str	q1, [Y], #16
 #else // DOUBLE
-	ld1	{v0.4s}, [X], #16
-	ld1	{v1.4s}, [X], #16
-	ld1	{v2.4s}, [X], #16
-	ld1	{v3.4s}, [X], #16
-	st1	{v0.4s}, [Y], #16
-	st1	{v1.4s}, [Y], #16
-	st1	{v2.4s}, [Y], #16
-	st1	{v3.4s}, [Y], #16
+	ldr	q0, [X], #16	// complex double: four 16-byte copies (64 bytes) per call
+	str	q0, [Y], #16
+	ldr	q1, [X], #16
+	str	q1, [Y], #16
+	ldr	q2, [X], #16
+	str	q2, [Y], #16
+	ldr	q3, [X], #16
+	str	q3, [Y], #16
 #endif
 #endif
 
diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S
index 575c15e53..6c0d84f98 100644
--- a/kernel/arm64/iamax.S
+++ b/kernel/arm64/iamax.S
@@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fabs	MAXF, MAXF
 .endm
 
+.macro KERNEL_F8
+#if !defined(DOUBLE)
+	ldp	q2, q3, [X], #32	// load 8 floats (32 bytes), X post-incremented
+	fabs	v2.4s, v2.4s	// absolute values, lanes 0-3
+	fabs	v3.4s, v3.4s	// absolute values, lanes 4-7
+	fmax	v2.4s, v2.4s, v3.4s	// lane-wise max of the two halves
+	fmaxv	TMPF, v2.4s	// reduce across lanes: largest value of the 8 (TMPF is defined outside this hunk)
+	fcmp	MAXF, TMPF	// compare running value against this block
+	fcsel	MAXF, MAXF, TMPF, COND	// keep MAXF when COND holds, else take the block value (COND is defined outside this hunk)
+	csel	INDEX, INDEX, Z, COND	// same condition: keep INDEX, else record Z. Only the block is identified here, not the element
+	add	Z, Z, #8	// advance the position counter by the 8 elements consumed
+#else
+	ldp	q2, q3, [X], #32	// load doubles 0-3 (32 bytes), X post-incremented
+	ldp	q4, q5, [X], #32	// load doubles 4-7 (32 bytes), X post-incremented
+	fabs	v2.2d, v2.2d
+	fabs	v3.2d, v3.2d
+	fabs	v4.2d, v4.2d
+	fabs	v5.2d, v5.2d
+
+	fmax	v2.2d, v2.2d, v3.2d	// lane-wise max tree: four vectors down to one
+	fmax	v4.2d, v4.2d, v5.2d
+	fmax	v2.2d, v2.2d, v4.2d
+	fmaxp	TMPF, v2.2d	// max of the two remaining lanes: largest value of the 8
+
+	fcmp	MAXF, TMPF
+	fcsel	MAXF, MAXF, TMPF, COND	// keep MAXF when COND holds, else take the block value
+	csel	INDEX, INDEX, Z, COND	// same condition: keep INDEX, else record Z
+	add	Z, Z, #8	// advance the position counter by the 8 elements consumed
+#endif
+	PRFM	PLDL1KEEP, [X, #1024]	// prefetch 1024 bytes ahead of the already advanced X
+.endm
+// KERNEL_F8_FINALIZE: reload the 8 elements starting at position INDEX and pick the lowest position whose absolute value equals MAXF
+.macro KERNEL_F8_FINALIZE
+	sub	x6, INDEX, #1	// x6 = INDEX - 1, i.e. INDEX is treated as a one-based position
+#if !defined(DOUBLE)
+	lsl	x6, x6, #2	// scale to a byte offset (4 bytes per float)
+	add	x7, x7, x6	// x7 holds the X pointer saved before the F8 loop (see mov x7, X in the driver code)
+	ldp	q2, q3, [x7]	// reload 8 floats starting at position INDEX
+	fabs	v2.4s, v2.4s
+	fabs	v3.4s, v3.4s
+
+	ins	v4.s[0], v3.s[0]	// move lanes of v3 (offsets 4-7) into lane 0 of v4-v7 so they can be read as s4-s7
+	ins	v5.s[0], v3.s[1]
+	ins	v6.s[0], v3.s[2]
+	ins	v7.s[0], v3.s[3]
+
+	add	x6, INDEX, #7	// candidate position for offset 7. Scan runs high to low so the lowest match is written last
+	fcmp	MAXF, s7
+	csel	INDEX, x6, INDEX, eq	// take the candidate only on exact equality with MAXF
+
+	sub	x6, x6, #1	// offset 6
+	fcmp	MAXF, s6
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 5
+	fcmp	MAXF, s5
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 4
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v2.s[0]	// now the lanes of v2 (offsets 0-3)
+	ins	v5.s[0], v2.s[1]
+	ins	v6.s[0], v2.s[2]
+	ins	v7.s[0], v2.s[3]
+
+	sub	x6, x6, #1	// offset 3
+	fcmp	MAXF, s7
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 2
+	fcmp	MAXF, s6
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 1
+	fcmp	MAXF, s5
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 0
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+#else
+	add	x6, x6, #4	// start with the upper half: element offset (INDEX - 1) + 4
+	lsl	x6, x6, #3	// scale to a byte offset (8 bytes per double)
+	add	x7, x7, x6	// x7 holds the X pointer saved before the F8 loop
+	ldp	q2, q3, [x7]	// reload doubles at offsets 4-7
+
+	fabs	v2.2d, v2.2d
+	fabs	v3.2d, v3.2d
+
+	ins	v4.d[0], v2.d[0]	// offsets 4-7 into lane 0 of v4-v7 so they can be read as d4-d7
+	ins	v5.d[0], v2.d[1]
+	ins	v6.d[0], v3.d[0]
+	ins	v7.d[0], v3.d[1]
+
+	add	x6, INDEX, #7	// candidate position for offset 7. Scan runs high to low so the lowest match is written last
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq	// take the candidate only on exact equality with MAXF
+
+	sub	x6, x6, #1	// offset 6
+	fcmp	MAXF, d6
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 5
+	fcmp	MAXF, d5
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 4
+	fcmp	MAXF, d4
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x7, x7, #32	// step back 32 bytes to the lower half
+	ldp	q2, q3, [x7]	// reload doubles at offsets 0-3
+
+	fabs	v2.2d, v2.2d
+	fabs	v3.2d, v3.2d
+
+	ins	v4.d[0], v2.d[0]	// offsets 0-3 into lane 0 of v4-v7
+	ins	v5.d[0], v2.d[1]
+	ins	v6.d[0], v3.d[0]
+	ins	v7.d[0], v3.d[1]
+
+	sub	x6, x6, #1	// offset 3
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 2
+	fcmp	MAXF, d6
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 1
+	fcmp	MAXF, d5
+	csel	INDEX, x6, INDEX, eq
+
+	sub	x6, x6, #1	// offset 0
+	fcmp	MAXF, d4
+	csel	INDEX, x6, INDEX, eq
+#endif
+.endm
+
+
 .macro KERNEL_S1
 	ld1	TMPVF, [X], INC_X
 	add	Z, Z, #1
@@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	INC_X, xzr
 	ble	iamax_kernel_zero
 
+	cmp	INC_X, #1
+	bne	iamax_kernel_S_BEGIN	// non-unit stride: use the pre-existing element-at-a-time path
+	mov	x7, X	// save the start of X. KERNEL_F8_FINALIZE re-reads the winning block relative to it
+
+iamax_kernel_F_BEGIN:
+
+	INIT_S	// macro body not visible in this hunk. NOTE(review): presumably consumes the first element and seeds MAXF, INDEX and Z - confirm
+
+	subs	N, N, #1	// one element fewer to process
+	ble	iamax_kernel_L999
+
+	asr	I, N, #3	// I = number of full blocks of 8
+	cmp	I, xzr
+	beq	iamax_kernel_F1	// fewer than 8 left: straight to the scalar tail
+
+	add	Z, Z, #1	// bias Z by one for the block loop, undone after it. NOTE(review): presumably so Z names the first element of each block - depends on INIT_S, confirm
+iamax_kernel_F8:
+
+	KERNEL_F8
+
+	subs	I, I, #1
+	bne	iamax_kernel_F8
+
+	KERNEL_F8_FINALIZE	// refine INDEX from a block position to an element position, once, after all blocks
+
+	sub	Z, Z, #1	// remove the bias before the tail, where KERNEL_S1 increments Z itself
+iamax_kernel_F1:
+
+	ands	I, N, #7	// I = leftover element count (N mod 8)
+	ble	iamax_kernel_L999	// nothing left over
+
+iamax_kernel_F10:
+
+	KERNEL_S1	// one element per iteration
+
+	subs	I, I, #1
+	bne	iamax_kernel_F10
+
+	b	iamax_kernel_L999	// skip the strided path below
+
+iamax_kernel_S_BEGIN:
+
 	INIT_S
 
 	subs	N, N, #1
diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S
index ebdc671e0..9b252ec98 100644
--- a/kernel/arm64/izamax.S
+++ b/kernel/arm64/izamax.S
@@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 .endm
 
+.macro KERNEL_F8
+#if !defined(DOUBLE)
+	ldp	q2, q3, [X], #32	// load 64 bytes in total: 8 elements of 2 floats each
+	ldp	q4, q5, [X], #32
+
+	fabs	v2.4s, v2.4s	// absolute value of every component
+	fabs	v3.4s, v3.4s
+	fabs	v4.4s, v4.4s
+	fabs	v5.4s, v5.4s
+
+	faddp	v2.4s, v2.4s, v3.4s	// sum adjacent lanes: one value per element, elements 0-3 (presumably |re| + |im|)
+	faddp	v3.4s, v4.4s, v5.4s	// same for elements 4-7
+
+	fmax	v2.4s, v2.4s, v3.4s	// lane-wise max of the two halves
+	fmaxv	TMPF, v2.4s	// reduce across lanes: largest value of the 8 (TMPF is defined outside this hunk)
+	fcmp	MAXF, TMPF
+	fcsel	MAXF, MAXF, TMPF, COND	// keep MAXF when COND holds, else take the block value (COND is defined outside this hunk)
+	csel	INDEX, INDEX, Z, COND	// same condition: keep INDEX, else record Z. Only the block is identified here
+	add	Z, Z, #8	// advance the position counter by the 8 elements consumed
+#else
+	ldp	q2, q3, [X], #32	// load 128 bytes in total: 8 elements of 2 doubles each
+	ldp	q4, q5, [X], #32
+	ldp	q16, q17, [X], #32	// v16-v19 are used for the extra data, v8-v15 are left untouched
+	ldp	q18, q19, [X], #32
+
+	fabs	v2.2d, v2.2d	// absolute value of every component
+	fabs	v3.2d, v3.2d
+	fabs	v4.2d, v4.2d
+	fabs	v5.2d, v5.2d
+	fabs	v16.2d, v16.2d
+	fabs	v17.2d, v17.2d
+	fabs	v18.2d, v18.2d
+	fabs	v19.2d, v19.2d
+
+	faddp	v2.2d, v2.2d, v3.2d	// sum adjacent lanes: elements 0-1 (presumably |re| + |im|)
+	faddp	v3.2d, v4.2d, v5.2d	// elements 2-3
+	faddp	v4.2d, v16.2d, v17.2d	// elements 4-5
+	faddp	v5.2d, v18.2d, v19.2d	// elements 6-7
+
+	fmax	v2.2d, v2.2d, v3.2d	// lane-wise max tree: four vectors down to one
+	fmax	v4.2d, v4.2d, v5.2d
+	fmax	v2.2d, v2.2d, v4.2d
+	fmaxp	TMPF, v2.2d	// max of the two remaining lanes: largest value of the 8
+
+	fcmp	MAXF, TMPF
+	fcsel	MAXF, MAXF, TMPF, COND	// keep MAXF when COND holds, else take the block value
+	csel	INDEX, INDEX, Z, COND	// same condition: keep INDEX, else record Z
+	add	Z, Z, #8	// advance the position counter by the 8 elements consumed
+#endif
+	PRFM	PLDL1KEEP, [X, #1024]	// prefetch 1024 bytes ahead of the already advanced X
+.endm
+// KERNEL_F8_FINALIZE: recompute the 8 per-element sums starting at position INDEX and pick the lowest position whose sum equals MAXF
+.macro KERNEL_F8_FINALIZE
+	sub	x6, INDEX, #1	// x6 = INDEX - 1, i.e. INDEX is treated as a one-based position
+#if !defined(DOUBLE)
+	lsl	x6, x6, #3	// scale to a byte offset (8 bytes per element)
+	add	x7, x7, x6	// x7 holds the X pointer saved before the F8 loop (see mov x7, X in the driver code)
+
+	ldp	q2, q3, [x7]	// reload the 8 elements starting at position INDEX
+	ldp	q4, q5, [x7, #32]
+
+	fabs	v2.4s, v2.4s
+	fabs	v3.4s, v3.4s
+	fabs	v4.4s, v4.4s
+	fabs	v5.4s, v5.4s
+
+	faddp	v2.4s, v2.4s, v3.4s	// v2 = sums for offsets 0-3, same operation as in KERNEL_F8
+	faddp	v3.4s, v4.4s, v5.4s	// v3 = sums for offsets 4-7
+
+	ins	v4.s[0], v3.s[3]	// offset 7. v4 is free again and serves as the scalar scratch s4
+	add	x6, INDEX, #7	// candidate position. Scan runs high to low so the lowest match is written last
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq	// take the candidate only on exact equality with MAXF
+
+	ins	v4.s[0], v3.s[2]	// offset 6
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v3.s[1]	// offset 5
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v3.s[0]	// offset 4
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v2.s[3]	// offset 3
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v2.s[2]	// offset 2
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v2.s[1]	// offset 1
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v4.s[0], v2.s[0]	// offset 0
+	sub	x6, x6, #1
+	fcmp	MAXF, s4
+	csel	INDEX, x6, INDEX, eq
+#else
+	lsl	x6, x6, #4	// scale to a byte offset (16 bytes per element)
+	add	x7, x7, x6	// x7 holds the X pointer saved before the F8 loop
+
+	ldp	q2, q3, [x7]	// reload the 8 elements starting at position INDEX
+	ldp	q4, q5, [x7, #32]
+	ldp	q16, q17, [x7, #64]
+	ldp	q18, q19, [x7, #96]
+
+	fabs	v2.2d, v2.2d
+	fabs	v3.2d, v3.2d
+	fabs	v4.2d, v4.2d
+	fabs	v5.2d, v5.2d
+	fabs	v16.2d, v16.2d
+	fabs	v17.2d, v17.2d
+	fabs	v18.2d, v18.2d
+	fabs	v19.2d, v19.2d
+
+	faddp	v2.2d, v2.2d, v3.2d	// v2 = sums for offsets 0-1, same operation as in KERNEL_F8
+	faddp	v3.2d, v4.2d, v5.2d	// v3 = sums for offsets 2-3
+	faddp	v4.2d, v16.2d, v17.2d	// v4 = sums for offsets 4-5
+	faddp	v5.2d, v18.2d, v19.2d	// v5 = sums for offsets 6-7
+
+	ins	v7.d[0], v5.d[1]	// offset 7. v7 serves as the scalar scratch d7
+	add	x6, INDEX, #7	// candidate position. Scan runs high to low so the lowest match is written last
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq	// take the candidate only on exact equality with MAXF
+
+	ins	v7.d[0], v5.d[0]	// offset 6
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v4.d[1]	// offset 5
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v4.d[0]	// offset 4
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v3.d[1]	// offset 3
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v3.d[0]	// offset 2
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v2.d[1]	// offset 1
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+
+	ins	v7.d[0], v2.d[0]	// offset 0
+	sub	x6, x6, #1
+	fcmp	MAXF, d7
+	csel	INDEX, x6, INDEX, eq
+#endif
+.endm
+
 .macro KERNEL_S1
 #if !defined(DOUBLE)
 	ld1	{v1.2s}, [X], INC_X
@@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp	INC_X, xzr
 	ble	iamax_kernel_zero
 
+	cmp	INC_X, #1
+	bne	iamax_kernel_S_BEGIN	// non-unit stride: use the pre-existing element-at-a-time path
+	mov	x7, X	// save the start of X. KERNEL_F8_FINALIZE re-reads the winning block relative to it
+
+
+iamax_kernel_F_BEGIN:
+
+	INIT_S	// macro body not visible in this hunk. NOTE(review): presumably consumes the first element and seeds MAXF, INDEX and Z - confirm
+
+	subs	N, N, #1	// one element fewer to process
+	ble	iamax_kernel_L999
+
+	asr	I, N, #3	// I = number of full blocks of 8
+	cmp	I, xzr
+	ble	iamax_kernel_F1	// fewer than 8 left: straight to the scalar tail
+
+	add	Z, Z, #1	// bias Z by one for the block loop, undone after it. NOTE(review): presumably so Z names the first element of each block - depends on INIT_S, confirm
+
+iamax_kernel_F8:
+
+	KERNEL_F8
+
+	subs	I, I, #1
+	bne	iamax_kernel_F8
+
+	KERNEL_F8_FINALIZE	// refine INDEX from a block position to an element position, once, after all blocks
+
+	sub	Z, Z, #1	// remove the bias before the scalar tail
+iamax_kernel_F1:
+
+	ands	I, N, #7	// I = leftover element count (N mod 8)
+	ble	iamax_kernel_L999	// nothing left over
+
+iamax_kernel_F10:
+
+	KERNEL_S1	// one element per iteration
+
+	subs	I, I, #1
+	bne	iamax_kernel_F10
+
+	b	iamax_kernel_L999	// skip the strided path below
+
+iamax_kernel_S_BEGIN:
+
 	INIT_S
 
 	subs	N, N, #1