From 3eaff85191b500734b47ff08f2649df9ca1278d1 Mon Sep 17 00:00:00 2001 From: kaustubh Date: Tue, 13 Dec 2016 11:41:17 +0530 Subject: [PATCH 1/3] Updated data prefetch in TRSM, ASUM, DOT functions Signed-off-by: kaustubh --- kernel/mips/casum_msa.c | 101 ++++++--- kernel/mips/cdot_msa.c | 182 ++++++++++------ kernel/mips/dasum_msa.c | 101 ++++++--- kernel/mips/ddot_msa.c | 43 ++-- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 143 +++++-------- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 142 +++++-------- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 112 ++++------ kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 114 ++++------ kernel/mips/macros_msa.h | 17 ++ kernel/mips/sasum_msa.c | 74 ++++--- kernel/mips/sdot_msa.c | 43 ++-- kernel/mips/strsm_kernel_LN_8x8_msa.c | 33 +-- kernel/mips/strsm_kernel_LT_8x8_msa.c | 28 ++- kernel/mips/strsm_kernel_RN_8x8_msa.c | 25 ++- kernel/mips/strsm_kernel_RT_8x8_msa.c | 31 +-- kernel/mips/zasum_msa.c | 101 ++++++--- kernel/mips/zdot_msa.c | 291 ++++++++++++++++++-------- 17 files changed, 934 insertions(+), 647 deletions(-) diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c index 5bb948392..6509cb2fc 100644 --- a/kernel/mips/casum_msa.c +++ b/kernel/mips/casum_msa.c @@ -47,39 +47,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (1 == inc_x) { -#ifdef ENABLE_PREFETCH - FLOAT *x_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 31) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - x_pref = x + pref_offset + 128; -#endif + FLOAT *x_pref; + BLASLONG pref_offset; - for (i = (n >> 5); i--;) - { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 128(%[x_pref])\n\t" - "pref 0, 160(%[x_pref])\n\t" - "pref 0, 192(%[x_pref])\n\t" - "pref 0, 224(%[x_pref])\n\t" - - : : [x_pref] "r" (x_pref) - ); - - x_pref += 64; -#endif + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 5) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 64; + + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); + } + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); @@ -159,9 +175,34 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { inc_x2 = 2 * inc_x; - for (i = (n >> 4); i--;) + if (n > 16) { LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 
4) - 1; i--;) + { + LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + + LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); + } + LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c index 2079c9e76..cb86e5fbd 100644 --- a/kernel/mips/cdot_msa.c +++ b/kernel/mips/cdot_msa.c @@ -45,10 +45,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA BLASLONG i = 0; FLOAT dot[2]; BLASLONG inc_x2, inc_y2; - FLOAT x0, x1, x2, x3, x4, x5, x6, x7; - FLOAT y0, y1, y2, y3, y4, y5, y6, y7; - v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; - v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, vx8, vx9, vx10, vx11; + v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, vy8, vy9, vy10, vy11; v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; v4f32 dot0 = {0, 0, 0, 0}; @@ -71,78 +70,135 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((1 == inc_x) && (1 == inc_y)) { -#ifdef ENABLE_PREFETCH - FLOAT *x_pref, *y_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 15) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - x_pref = x + pref_offset + 64; + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; - pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) - { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - y_pref = y + pref_offset + 64; -#endif + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 16; - for (i = (n >> 4); i--;) - { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 0(%[y_pref])\n\t" - "pref 0, 32(%[y_pref])\n\t" - "pref 0, 64(%[y_pref])\n\t" - "pref 0, 96(%[y_pref])\n\t" + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 64 + 16; - : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref) - ); - - x_pref += 32; - y_pref += 32; -#endif - - LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); - PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); - PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); - PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); - PCKEVOD_W2_SP(vy1, 
vy0, vy0r, vy0i); - PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); - PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); - PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); + for (i = (n >> 4) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 32; + y_pref += 32; + + vx4 = LD_SP(x); x += 4; + vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2); + dot0 += (vx0r * vy0r); + vx5 = LD_SP(x); x += 4; + vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_SP(y); y += 4; + vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2); + dot2 += (vx1r * vy1r); + vy5 = LD_SP(y); y += 4; + vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_SP(x); x += 4; + vx7 = LD_SP(x); x += 4; + vy6 = LD_SP(y); y += 4; + vy7 = LD_SP(y); y += 4; + vx8 = LD_SP(x); x += 4; + dot0 -= (vx0i * vy0i); + vx9 = LD_SP(x); x += 4; + vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4); + dot1 += (vx0r * vy0i); + vy8 = LD_SP(y); y += 4; + vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4); + dot2 -= (vx1i * vy1i); + vy9 = LD_SP(y); y += 4; + vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4); + dot3 += (vx1r * vy1i); + vx10 = LD_SP(x); x += 4; + vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4); + vx11 = LD_SP(x); x += 4; + vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6); + dot4 += (vx2r * vy2r); + vy10 = LD_SP(y); y += 4; + vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6); + dot5 += (vx2i * vy2r); + vy11 = LD_SP(y); y += 4; + vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6); + vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6); + dot6 += (vx3r * vy3r); + vx0r = (v4f32) __msa_pckev_w((v4i32) vx9, (v4i32) vx8); + dot7 += (vx3i * vy3r); + vx0i = (v4f32) __msa_pckod_w((v4i32) vx9, (v4i32) vx8); + vy0r = (v4f32) __msa_pckev_w((v4i32) vy9, (v4i32) vy8); + vx2 = vx10; + vy0i = (v4f32) __msa_pckod_w((v4i32) vy9, (v4i32) vy8); + vx3 = vx11; + dot4 -= (vx2i * vy2i); + vy2 = vy10; + dot5 += (vx2r * vy2i); + vy3 = vy11; + dot6 -= (vx3i * vy3i); + dot7 += (vx3r * vy3i); + } + + vx4 = LD_SP(x); x += 4; + vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2); dot0 += (vx0r * vy0r); - dot0 OP1 (vx0i * vy0i); - dot1 OP2 (vx0i * vy0r); - dot1 += (vx0r * vy0i); - + vx5 = LD_SP(x); x += 4; + vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_SP(y); y += 4; + vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2); dot2 += (vx1r * vy1r); - dot2 OP1 (vx1i * vy1i); - dot3 OP2 (vx1i * vy1r); + vy5 = LD_SP(y); y += 4; + vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_SP(x); x += 4; + vx7 = LD_SP(x); x += 4; + vy6 = LD_SP(y); y += 4; + vy7 = LD_SP(y); y += 4; + dot0 -= (vx0i * vy0i); + vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4); + dot1 += (vx0r * vy0i); + vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4); + dot2 -= (vx1i * vy1i); + vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4); dot3 += (vx1r * vy1i); - + vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4); + vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6); dot4 += (vx2r * vy2r); - dot4 OP1 (vx2i * vy2i); - dot5 OP2 (vx2i * vy2r); - dot5 += (vx2r * vy2i); - + vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6); + dot5 += (vx2i * vy2r); + vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6); + vy3i = (v4f32) 
__msa_pckod_w((v4i32) vy7, (v4i32) vy6); dot6 += (vx3r * vy3r); - dot6 OP1 (vx3i * vy3i); - dot7 OP2 (vx3i * vy3r); + dot7 += (vx3i * vy3r); + dot4 -= (vx2i * vy2i); + dot5 += (vx2r * vy2i); + dot6 -= (vx3i * vy3i); dot7 += (vx3r * vy3i); } diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c index 1128d63eb..2cb37fcce 100644 --- a/kernel/mips/dasum_msa.c +++ b/kernel/mips/dasum_msa.c @@ -47,39 +47,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (1 == inc_x) { -#ifdef ENABLE_PREFETCH - FLOAT *x_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 31) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - x_pref = x + pref_offset + 64; -#endif + FLOAT *x_pref; + BLASLONG pref_offset; - for (i = (n >> 5); i--;) - { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 128(%[x_pref])\n\t" - "pref 0, 160(%[x_pref])\n\t" - "pref 0, 192(%[x_pref])\n\t" - "pref 0, 224(%[x_pref])\n\t" - - : : [x_pref] "r" (x_pref) - ); - - x_pref += 32; -#endif + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 5) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 32; + + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); + } + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); @@ -153,9 +169,34 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - for (i = (n >> 4); i--;) + if (n > 16) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 4) - 1; i--;) + { + LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); + } + LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); diff --git a/kernel/mips/ddot_msa.c 
b/kernel/mips/ddot_msa.c index b92f3132a..9136e2115 100644 --- a/kernel/mips/ddot_msa.c +++ b/kernel/mips/ddot_msa.c @@ -44,25 +44,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ((1 == inc_x) && (1 == inc_y)) { + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 32; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32; + for (i = (n >> 4); i--;) { LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 256(%[x])\n\t" - "pref 0, 288(%[x])\n\t" - "pref 0, 320(%[x])\n\t" - "pref 0, 352(%[x])\n\t" - "pref 0, 256(%[y])\n\t" - "pref 0, 288(%[y])\n\t" - "pref 0, 320(%[y])\n\t" - "pref 0, 352(%[y])\n\t" - - : : [x] "r" (x), [y] "r" (y) - ); -#endif + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 16; + y_pref += 16; dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 0169ff814..9fb5141ca 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -45,25 +45,18 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, -96(%[a]) \n\t" - "pref 0, -32(%[a]) \n\t" - "pref 0, -160(%[a]) \n\t" - "pref 0, -224(%[a]) \n\t" - "pref 0, -64(%[a]) \n\t" - "pref 0, -128(%[a]) \n\t" - "pref 0, -192(%[a]) \n\t" - "pref 0, -256(%[a]) \n\t" - "pref 0, -320(%[a]) \n\t" - "pref 0, -384(%[a]) \n\t" - "pref 0, -448(%[a]) \n\t" - "pref 0, -512(%[a]) \n\t" - - : - : [a] "r"(a) - ); -#endif + PREF_OFFSET(a, -96); + PREF_OFFSET(a, -32); + PREF_OFFSET(a, -160); + PREF_OFFSET(a, -224); + PREF_OFFSET(a, -64); + PREF_OFFSET(a, -128); + PREF_OFFSET(a, -192); + PREF_OFFSET(a, -256); + PREF_OFFSET(a, -320); + PREF_OFFSET(a, -384); + PREF_OFFSET(a, -448); + PREF_OFFSET(a, -512); LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); @@ -72,29 +65,29 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) if (bk > 0) { - BLASLONG i; - FLOAT *pba = a, *pbb = b; - v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + BLASLONG i, pref_offset; + FLOAT *pba = a, *pbb = b, *pa0_pref; + v2f64 src_b, src_b0, src_b1; - LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(pbb, 2, src_b0, src_b1); + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); - for (i = (bk - 1) >> 1; i--;) + if (pref_offset) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 128(%[pba]) \n\t" - "pref 0, 160(%[pba]) \n\t" - "pref 0, 192(%[pba]) \n\t" - "pref 0, 224(%[pba]) \n\t" + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } - : - : [pba] "r"(pba) - ); -#endif + pa0_pref = a + pref_offset; - LD_DP4_INC(pba, 2, src_a8, src_a9, src_a16, src_a17); - LD_DP2_INC(pbb, 2, src_b2, 
src_b3); + for (i = bk >> 1; i--;) + { + PREF_OFFSET(pa0_pref, 128); + PREF_OFFSET(pa0_pref, 160); + PREF_OFFSET(pa0_pref, 192); + PREF_OFFSET(pa0_pref, 224); + + LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -123,33 +116,6 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2); - src_c0 -= src_a8 * src_b; - src_c1 -= src_a9 * src_b; - src_c2 -= src_a16 * src_b; - src_c3 -= src_a17 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2); - src_c4 -= src_a8 * src_b; - src_c5 -= src_a9 * src_b; - src_c6 -= src_a16 * src_b; - src_c7 -= src_a17 * src_b; - - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3); - src_c8 -= src_a8 * src_b; - src_c9 -= src_a9 * src_b; - src_c10 -= src_a16 * src_b; - src_c11 -= src_a17 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3); - src_c12 -= src_a8 * src_b; - src_c13 -= src_a9 * src_b; - src_c14 -= src_a16 * src_b; - src_c15 -= src_a17 * src_b; - } - - if ((bk - 1) & 1) - { src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; @@ -174,33 +140,38 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; - LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(pbb, 2, src_b0, src_b1); + pa0_pref += 16; } - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; - src_c2 -= src_a2 * src_b; - src_c3 -= src_a3 * src_b; + if (bk & 1) + { + LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pbb, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c4 -= src_a0 * src_b; - src_c5 -= src_a1 * src_b; - src_c6 -= src_a2 * src_b; - src_c7 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); - src_c8 -= src_a0 * src_b; - src_c9 -= src_a1 * src_b; - src_c10 -= src_a2 * src_b; - src_c11 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); - src_c12 -= src_a0 * src_b; - src_c13 -= src_a1 * src_b; - src_c14 -= src_a2 * src_b; - src_c15 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } } a -= 64; diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 6406988e5..525fc8585 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -44,27 +44,20 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) FLOAT *c_nxt2line = c + 2 * 
ldc; FLOAT *c_nxt3line = c + 3 * ldc; -#ifdef ENABLE_PREFETCH a += bk * 8; - __asm__ __volatile__( - "pref 0, (%[a]) \n\t" - "pref 0, 32(%[a]) \n\t" - "pref 0, 72(%[a]) \n\t" - "pref 0, 104(%[a]) \n\t" - "pref 0, 144(%[a]) \n\t" - "pref 0, 176(%[a]) \n\t" - "pref 0, 216(%[a]) \n\t" - "pref 0, 248(%[a]) \n\t" - "pref 0, 288(%[a]) \n\t" - "pref 0, 360(%[a]) \n\t" - "pref 0, 504(%[a]) \n\t" - "pref 0, 432(%[a]) \n\t" - - : - : [a] "r"(a) - ); + PREF_OFFSET(a, 0); + PREF_OFFSET(a, 32); + PREF_OFFSET(a, 72); + PREF_OFFSET(a, 104); + PREF_OFFSET(a, 144); + PREF_OFFSET(a, 176); + PREF_OFFSET(a, 216); + PREF_OFFSET(a, 248); + PREF_OFFSET(a, 288); + PREF_OFFSET(a, 360); + PREF_OFFSET(a, 504); + PREF_OFFSET(a, 432); a -= bk * 8; -#endif LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); @@ -73,28 +66,29 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) if (bk) { - BLASLONG i; - v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + BLASLONG i, pref_offset; + FLOAT *pa0_pref; + v2f64 src_b, src_b0, src_b1; - LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(b, 2, src_b0, src_b1); + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); - for (i = ((bk - 1) >> 1); i--;) + if (pref_offset) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 128(%[a]) \n\t" - "pref 0, 160(%[a]) \n\t" - "pref 0, 192(%[a]) \n\t" - "pref 0, 224(%[a]) \n\t" + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } - : - : [a] "r"(a) - ); -#endif + pa0_pref = a + pref_offset; - LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7); - LD_DP2_INC(b, 2, src_b2, src_b3); + for (i = (bk >> 1); i--;) + { + PREF_OFFSET(pa0_pref, 128); + PREF_OFFSET(pa0_pref, 160); + PREF_OFFSET(pa0_pref, 192); + PREF_OFFSET(pa0_pref, 224); + + LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -123,33 +117,6 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2); - src_c0 -= src_a4 * src_b; - src_c1 -= src_a5 * src_b; - src_c2 -= src_a6 * src_b; - src_c3 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2); - src_c4 -= src_a4 * src_b; - src_c5 -= src_a5 * src_b; - src_c6 -= src_a6 * src_b; - src_c7 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3); - src_c8 -= src_a4 * src_b; - src_c9 -= src_a5 * src_b; - src_c10 -= src_a6 * src_b; - src_c11 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3); - src_c12 -= src_a4 * src_b; - src_c13 -= src_a5 * src_b; - src_c14 -= src_a6 * src_b; - src_c15 -= src_a7 * src_b; - } - - if ((bk - 1) & 1) - { src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; @@ -174,33 +141,38 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; - LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(b, 2, src_b0, src_b1); + pa0_pref += 16; } - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; - src_c2 -= src_a2 * src_b; - src_c3 -= src_a3 * src_b; + if (bk & 1) + { + LD_DP4_INC(a, 2, src_a0, src_a1, 
src_a2, src_a3); + LD_DP2_INC(b, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c4 -= src_a0 * src_b; - src_c5 -= src_a1 * src_b; - src_c6 -= src_a2 * src_b; - src_c7 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); - src_c8 -= src_a0 * src_b; - src_c9 -= src_a1 * src_b; - src_c10 -= src_a2 * src_b; - src_c11 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); - src_c12 -= src_a0 * src_b; - src_c13 -= src_a1 * src_b; - src_c14 -= src_a2 * src_b; - src_c15 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } } ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index e5343ce23..cb361c511 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -46,29 +46,29 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) if (bk) { - BLASLONG i; - v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; - v2f64 src_b; + BLASLONG i, pref_offset; + FLOAT *pa0_pref; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b; - LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(b, 2, src_b0, src_b1); + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); - for (i = ((bk - 1) >> 1); i--;) + if (pref_offset) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 128(%[a]) \n\t" - "pref 0, 160(%[a]) \n\t" - "pref 0, 192(%[a]) \n\t" - "pref 0, 224(%[a]) \n\t" + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } - : - : [a] "r"(a) - ); -#endif + pa0_pref = a + pref_offset; - LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7); - LD_DP2_INC(b, 2, src_b2, src_b3); + for (i = (bk >> 1); i--;) + { + PREF_OFFSET(pa0_pref, 128); + PREF_OFFSET(pa0_pref, 160); + PREF_OFFSET(pa0_pref, 192); + PREF_OFFSET(pa0_pref, 224); + + LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -97,33 +97,6 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2); - src_c0 -= src_a4 * src_b; - src_c1 -= src_a5 * src_b; - src_c2 -= src_a6 * src_b; - src_c3 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2); - src_c4 -= src_a4 * src_b; - src_c5 -= src_a5 * src_b; - src_c6 -= src_a6 * src_b; - src_c7 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3); - src_c8 -= src_a4 * src_b; - src_c9 -= src_a5 * src_b; - src_c10 -= src_a6 * src_b; - src_c11 -= src_a7 * src_b; - - src_b = (v2f64) 
__msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3); - src_c12 -= src_a4 * src_b; - src_c13 -= src_a5 * src_b; - src_c14 -= src_a6 * src_b; - src_c15 -= src_a7 * src_b; - } - - if ((bk - 1) & 1) - { src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; @@ -148,33 +121,38 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; - LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(b, 2, src_b0, src_b1); + pa0_pref += 16; } - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; - src_c2 -= src_a2 * src_b; - src_c3 -= src_a3 * src_b; + if (bk & 1) + { + LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(b, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c4 -= src_a0 * src_b; - src_c5 -= src_a1 * src_b; - src_c6 -= src_a2 * src_b; - src_c7 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); - src_c8 -= src_a0 * src_b; - src_c9 -= src_a1 * src_b; - src_c10 -= src_a2 * src_b; - src_c11 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); - src_c12 -= src_a0 * src_b; - src_c13 -= src_a1 * src_b; - src_c14 -= src_a2 * src_b; - src_c15 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } } src_b0 = LD_DP(b + 0); diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 837ba3e5b..581a90f71 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -46,29 +46,29 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) if (bk > 0) { - BLASLONG i; - FLOAT *pba = a, *pbb = b; - v2f64 src_b, src_b0, src_b1, src_b2, src_b3; - v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + BLASLONG i, pref_offset; + FLOAT *pba = a, *pbb = b, *pa0_pref; + v2f64 src_b, src_b0, src_b1, src_a0, src_a1, src_a2, src_a3; - LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(pbb, 2, src_b0, src_b1); + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); - for (i = ((bk - 1) >> 1); i--;) + if (pref_offset) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 128(%[pba]) \n\t" - "pref 0, 160(%[pba]) \n\t" - "pref 0, 192(%[pba]) \n\t" - "pref 0, 224(%[pba]) \n\t" + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } - : - : [pba] "r"(pba) - ); -#endif - LD_DP4_INC(pba, 2, src_a4, src_a5, src_a6, src_a7); - LD_DP2_INC(pbb, 2, src_b2, src_b3); + pa0_pref = a + pref_offset; + + for (i = (bk >> 1); i--;) + { + PREF_OFFSET(pa0_pref, 128); + PREF_OFFSET(pa0_pref, 160); + PREF_OFFSET(pa0_pref, 192); + PREF_OFFSET(pa0_pref, 224); + + LD_DP4_INC(pba, 2, src_a0, 
src_a1, src_a2, src_a3); + LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; @@ -97,33 +97,6 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2); - src_c0 -= src_a4 * src_b; - src_c1 -= src_a5 * src_b; - src_c2 -= src_a6 * src_b; - src_c3 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2); - src_c4 -= src_a4 * src_b; - src_c5 -= src_a5 * src_b; - src_c6 -= src_a6 * src_b; - src_c7 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3); - src_c8 -= src_a4 * src_b; - src_c9 -= src_a5 * src_b; - src_c10 -= src_a6 * src_b; - src_c11 -= src_a7 * src_b; - - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3); - src_c12 -= src_a4 * src_b; - src_c13 -= src_a5 * src_b; - src_c14 -= src_a6 * src_b; - src_c15 -= src_a7 * src_b; - } - - if ((bk - 1) & 1) - { src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; @@ -148,33 +121,38 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; - LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); - LD_DP2_INC(pbb, 2, src_b0, src_b1); + pa0_pref += 16; } - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); - src_c0 -= src_a0 * src_b; - src_c1 -= src_a1 * src_b; - src_c2 -= src_a2 * src_b; - src_c3 -= src_a3 * src_b; + if (bk & 1) + { + LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2_INC(pbb, 2, src_b0, src_b1); - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); - src_c4 -= src_a0 * src_b; - src_c5 -= src_a1 * src_b; - src_c6 -= src_a2 * src_b; - src_c7 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); - src_c8 -= src_a0 * src_b; - src_c9 -= src_a1 * src_b; - src_c10 -= src_a2 * src_b; - src_c11 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; - src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); - src_c12 -= src_a0 * src_b; - src_c13 -= src_a1 * src_b; - src_c14 -= src_a2 * src_b; - src_c15 -= src_a3 * src_b; + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } } a -= 32; diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index 5ef685278..e7a0845cc 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -32,6 +32,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ENABLE_PREFETCH +#ifdef ENABLE_PREFETCH +inline static void prefetch_load_lf(unsigned char *src) +{ + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r" (src)); +} + +#define PREFETCH(PTR) prefetch_load_lf((unsigned char *)(PTR)); + +#define STRNG(X) #X +#define PREF_OFFSET(src_ptr, offset) \ + __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t" : : [src] "r" (src_ptr)); + +#else +#define PREFETCH(PTR) +#define PREF_OFFSET(src_ptr, offset) +#endif + #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_SP(...) LD_W(v4f32, __VA_ARGS__) diff --git a/kernel/mips/sasum_msa.c b/kernel/mips/sasum_msa.c index e15332f85..b38385d76 100644 --- a/kernel/mips/sasum_msa.c +++ b/kernel/mips/sasum_msa.c @@ -48,39 +48,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (1 == inc_x) { -#ifdef ENABLE_PREFETCH - FLOAT *x_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 63) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - x_pref = x + pref_offset + 128; -#endif + FLOAT *x_pref; + BLASLONG pref_offset; - for (i = 0; i < (n >> 6); i++) - { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 128(%[x_pref])\n\t" - "pref 0, 160(%[x_pref])\n\t" - "pref 0, 192(%[x_pref])\n\t" - "pref 0, 224(%[x_pref])\n\t" - - : : [x_pref] "r" (x_pref) - ); - - x_pref += 64; -#endif + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = 0; i < (n >> 6) - 1; i++) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 64; + + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); + sum_abs0 += AND_VEC_W(src4); + sum_abs1 += AND_VEC_W(src5); + sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); + + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); + } + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index f281db349..e02e10c61 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -48,25 +48,40 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ((1 == inc_x) && (1 == inc_y)) { + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset 
= pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 64; + for (i = (n >> 5); i--;) { LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 256(%[x])\n\t" - "pref 0, 288(%[x])\n\t" - "pref 0, 320(%[x])\n\t" - "pref 0, 352(%[x])\n\t" - "pref 0, 256(%[y])\n\t" - "pref 0, 288(%[y])\n\t" - "pref 0, 320(%[y])\n\t" - "pref 0, 352(%[y])\n\t" - - : : [x] "r" (x), [y] "r" (y) - ); -#endif + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 32; + y_pref += 32; dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c index 1974a384b..56a3398cb 100644 --- a/kernel/mips/strsm_kernel_LN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c @@ -58,21 +58,24 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { - BLASLONG k; - FLOAT *aa = a, *bb = b; - v4f32 src_bb0, src_bb1, src_b0, src_b1, src_b2, src_b3, src_a1; + BLASLONG k, pref_offset; + FLOAT *aa = a, *bb = b, *pa0_pref; + v4f32 src_a1, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1; + + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); + + if (pref_offset) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + + pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 64(%[aa]) \n\t" - "pref 0, 96(%[aa]) \n\t" - - : - : [aa] "r" (aa) - ); -#endif + PREF_OFFSET(pa0_pref, 64); + PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); @@ -119,12 +122,14 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; + + pa0_pref += 16; } if (bk & 1) { - LD_SP2(aa, 4, src_a0, src_a1); - LD_SP2(bb, 4, src_bb0, src_bb1); + LD_SP2_INC(aa, 4, src_a0, src_a1); + LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c index 547a55fe8..a666915bb 100644 --- a/kernel/mips/strsm_kernel_LT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c @@ -58,20 +58,24 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { - BLASLONG k; - v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1; + BLASLONG k, pref_offset; + FLOAT *pa0_pref; + v4f32 src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1; + + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); + + if (pref_offset) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + + pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 64(%[a]) \n\t" - "pref 0, 96(%[a]) \n\t" - - : - : [a] "r" (a) - ); -#endif + PREF_OFFSET(pa0_pref, 64); + PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); @@ -118,6 +122,8 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; + + pa0_pref += 16; 
} if (bk & 1) diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c index d1d02d5e3..ee3894889 100644 --- a/kernel/mips/strsm_kernel_RN_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c @@ -56,20 +56,25 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { - BLASLONG k; + BLASLONG k, pref_offset; + FLOAT *pa0_pref; v4f32 src_a0, src_a1, src_bb0, src_bb1; + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); + + if (pref_offset) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + + pa0_pref = a + pref_offset; + for (k = 0; k < (bk >> 1); k++) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 64(%[a]) \n\t" - "pref 0, 96(%[a]) \n\t" + PREF_OFFSET(pa0_pref, 64); + PREF_OFFSET(pa0_pref, 96); - : - : [a] "r" (a) - ); -#endif LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); @@ -115,6 +120,8 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; + + pa0_pref += 16; } if (bk & 1) diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c index 6d3904660..57438f7c7 100644 --- a/kernel/mips/strsm_kernel_RT_8x8_msa.c +++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c @@ -56,21 +56,24 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO if (bk > 0) { - BLASLONG k; - FLOAT *aa = a, *bb = b; + BLASLONG k, pref_offset; + FLOAT *aa = a, *bb = b, *pa0_pref; v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1; + pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); + + if (pref_offset) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + + pa0_pref = a + pref_offset; + for (k = 0; k < (bk >> 1); k++) { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 64(%[aa]) \n\t" - "pref 0, 96(%[aa]) \n\t" - - : - : [aa] "r" (aa) - ); -#endif + PREF_OFFSET(pa0_pref, 64); + PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); @@ -117,12 +120,14 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; + + pa0_pref += 16; } if (bk & 1) { - LD_SP2(aa, 4, src_a0, src_a1); - LD_SP2(bb, 4, src_bb0, src_bb1); + LD_SP2_INC(aa, 4, src_a0, src_a1); + LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c index 8c4f8d175..6a2624424 100644 --- a/kernel/mips/zasum_msa.c +++ b/kernel/mips/zasum_msa.c @@ -47,39 +47,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (1 == inc_x) { -#ifdef ENABLE_PREFETCH - FLOAT *x_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 16) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(FLOAT); - x_pref = x + pref_offset + 64; -#endif + FLOAT *x_pref; + BLASLONG pref_offset; - for (i = (n >> 4); i--;) - { -#ifdef ENABLE_PREFETCH - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 128(%[x_pref])\n\t" - "pref 0, 160(%[x_pref])\n\t" - "pref 0, 192(%[x_pref])\n\t" - "pref 0, 224(%[x_pref])\n\t" - - : : [x_pref] "r" 
(x_pref) - ); - - x_pref += 32; -#endif + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 4) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 32; + + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); + } + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); @@ -149,9 +165,34 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { inc_x *= 2; - for (i = (n >> 4); i--;) + if (n > 16) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + for (i = (n >> 4) - 1; i--;) + { + LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); + } + LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c index f3c1847b4..33012fcf1 100644 --- a/kernel/mips/zdot_msa.c +++ b/kernel/mips/zdot_msa.c @@ -45,8 +45,8 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA BLASLONG i = 0; FLOAT dot[2]; BLASLONG inc_x2, inc_y2; - v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; - v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; + v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, vx8, vx9, vx10, vx11; + v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, vy8, vy9, vy10, vy11; v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; v2f64 dot0 = {0, 0}; @@ -71,116 +71,239 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; - -#ifdef ENABLE_PREFETCH if ((1 == inc_x) && (1 == inc_y)) { - double *x_pref, *y_pref; - BLASLONG pref_offset; - - pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) + if (n > 7) { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(double); - x_pref = x + pref_offset + 32; + FLOAT *x_pref, *y_pref; + BLASLONG 
pref_offset; - pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); - if (pref_offset > 0) - { - pref_offset = L1_DATA_LINESIZE - pref_offset; - } - pref_offset = pref_offset / sizeof(double); - y_pref = y + pref_offset + 32; + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 32 + 8; - for (i = (n >> 3); i--;) - { - __asm__ __volatile__( - "pref 0, 0(%[x_pref])\n\t" - "pref 0, 32(%[x_pref])\n\t" - "pref 0, 64(%[x_pref])\n\t" - "pref 0, 96(%[x_pref])\n\t" - "pref 0, 0(%[y_pref])\n\t" - "pref 0, 32(%[y_pref])\n\t" - "pref 0, 64(%[y_pref])\n\t" - "pref 0, 96(%[y_pref])\n\t" + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32 + 8; - : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref) - ); - - x_pref += 16; - y_pref += 16; - - LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); - PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); - PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); - PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); - PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); - PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); - PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); - PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); + for (i = (n >> 3) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 16; + y_pref += 16; + + vx4 = LD_DP(x); x += 2; + vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); + dot0 += (vx0r * vy0r); + vx5 = LD_DP(x); x += 2; + vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_DP(y); y += 2; + vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); + dot2 += (vx1r * vy1r); + vy5 = LD_DP(y); y += 2; + vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_DP(x); x += 2; + vx7 = LD_DP(x); x += 2; + vy6 = LD_DP(y); y += 2; + vy7 = LD_DP(y); y += 2; + vx8 = LD_DP(x); x += 2; + dot0 -= (vx0i * vy0i); + vx9 = LD_DP(x); x += 2; + vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); + dot1 += (vx0r * vy0i); + vy8 = LD_DP(y); y += 2; + vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); + dot2 -= (vx1i * vy1i); + vy9 = LD_DP(y); y += 2; + vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); + dot3 += (vx1r * vy1i); + vx10 = LD_DP(x); x += 2; + vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); + vx11 = LD_DP(x); x += 2; + vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); + dot4 += (vx2r * vy2r); + vy10 = LD_DP(y); y += 2; + vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); + dot5 += (vx2i * vy2r); + vy11 = LD_DP(y); y += 2; + vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); + vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); + dot6 += (vx3r * vy3r); + vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8); + dot7 += (vx3i * vy3r); + vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8); + vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8); + vx2 = vx10; + vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8); + vx3 = vx11; + dot4 -= (vx2i * vy2i); + vy2 = vy10; + dot5 
+= (vx2r * vy2i); + vy3 = vy11; + dot6 -= (vx3i * vy3i); + dot7 += (vx3r * vy3i); + } + + vx4 = LD_DP(x); x += 2; + vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); - dot0 OP1 (vx0i * vy0i); - dot1 OP2 (vx0i * vy0r); - dot1 += (vx0r * vy0i); - + vx5 = LD_DP(x); x += 2; + vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_DP(y); y += 2; + vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); - dot2 OP1 (vx1i * vy1i); - dot3 OP2 (vx1i * vy1r); + vy5 = LD_DP(y); y += 2; + vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_DP(x); x += 2; + vx7 = LD_DP(x); x += 2; + vy6 = LD_DP(y); y += 2; + vy7 = LD_DP(y); y += 2; + dot0 -= (vx0i * vy0i); + vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); + dot1 += (vx0r * vy0i); + vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); + dot2 -= (vx1i * vy1i); + vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); - + vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); + vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); - dot4 OP1 (vx2i * vy2i); - dot5 OP2 (vx2i * vy2r); - dot5 += (vx2r * vy2i); - + vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); + dot5 += (vx2i * vy2r); + vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); + vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); - dot6 OP1 (vx3i * vy3i); - dot7 OP2 (vx3i * vy3r); + dot7 += (vx3i * vy3r); + dot4 -= (vx2i * vy2i); + dot5 += (vx2r * vy2i); + dot6 -= (vx3i * vy3i); dot7 += (vx3r * vy3i); } } - else -#endif - for (i = (n >> 3); i--;) + else if (n > 7) { - LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); - PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); - PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); - PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); - PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); - PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); - PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); - PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); + for (i = (n >> 3) - 1; i--;) + { + vx4 = LD_DP(x); x += inc_x2; + vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); + dot0 += (vx0r * vy0r); + vx5 = LD_DP(x); x += inc_x2; + vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_DP(y); y += inc_y2; + vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); + dot2 += (vx1r * vy1r); + vy5 = LD_DP(y); y += inc_y2; + vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_DP(x); x += inc_x2; + vx7 = LD_DP(x); x += inc_x2; + vy6 = LD_DP(y); y += inc_y2; + vy7 = LD_DP(y); y += inc_y2; + vx8 = LD_DP(x); x += inc_x2; + dot0 -= (vx0i * vy0i); + vx9 = LD_DP(x); x += inc_x2; + vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); + dot1 += (vx0r * vy0i); + vy8 = LD_DP(y); y += inc_y2; + vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); + dot2 -= (vx1i * vy1i); + vy9 = LD_DP(y); y += inc_y2; + vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); + dot3 += (vx1r * vy1i); + vx10 = LD_DP(x); x += inc_x2; + vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); + vx11 = LD_DP(x); x += inc_x2; + vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); + dot4 += (vx2r * vy2r); + vy10 = LD_DP(y); y += inc_y2; + vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); + dot5 
+= (vx2i * vy2r); + vy11 = LD_DP(y); y += inc_y2; + vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); + vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); + dot6 += (vx3r * vy3r); + vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8); + dot7 += (vx3i * vy3r); + vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8); + vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8); + vx2 = vx10; + vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8); + vx3 = vx11; + dot4 -= (vx2i * vy2i); + vy2 = vy10; + dot5 += (vx2r * vy2i); + vy3 = vy11; + dot6 -= (vx3i * vy3i); + dot7 += (vx3r * vy3i); + } + + vx4 = LD_DP(x); x += inc_x2; + vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); - dot0 OP1 (vx0i * vy0i); - dot1 OP2 (vx0i * vy0r); - dot1 += (vx0r * vy0i); - + vx5 = LD_DP(x); x += inc_x2; + vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); + dot1 += (vx0i * vy0r); + vy4 = LD_DP(y); y += inc_y2; + vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); - dot2 OP1 (vx1i * vy1i); - dot3 OP2 (vx1i * vy1r); + vy5 = LD_DP(y); y += inc_y2; + vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); + dot3 += (vx1i * vy1r); + vx6 = LD_DP(x); x += inc_x2; + vx7 = LD_DP(x); x += inc_x2; + vy6 = LD_DP(y); y += inc_y2; + vy7 = LD_DP(y); y += inc_y2; + dot0 -= (vx0i * vy0i); + vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); + dot1 += (vx0r * vy0i); + vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); + dot2 -= (vx1i * vy1i); + vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); - + vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); + vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); - dot4 OP1 (vx2i * vy2i); - dot5 OP2 (vx2i * vy2r); - dot5 += (vx2r * vy2i); - + vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); + dot5 += (vx2i * vy2r); + vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); + vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); - dot6 OP1 (vx3i * vy3i); - dot7 OP2 (vx3i * vy3r); + dot7 += (vx3i * vy3r); + dot4 -= (vx2i * vy2i); + dot5 += (vx2r * vy2i); + dot6 -= (vx3i * vy3i); dot7 += (vx3r * vy3i); } From 9db451acd0538bdf8bd581353bb6dd10f01da2a6 Mon Sep 17 00:00:00 2001 From: kaustubh Date: Tue, 13 Dec 2016 14:02:14 +0530 Subject: [PATCH 2/3] Updated data prefetch in TRSM, ASUM, DOT functions Signed-off-by: kaustubh --- kernel/mips/macros_msa.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index e7a0845cc..a2f90fe59 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #ifndef __MACROS_MSA_H__
 #define __MACROS_MSA_H__
 
+#include <stdint.h>
 #include <msa.h>
 
 #define ENABLE_PREFETCH

From 5f93aa5f87141fd4cb01ff8487cec6c803904647 Mon Sep 17 00:00:00 2001
From: kaustubh
Date: Wed, 14 Dec 2016 14:05:11 +0530
Subject: [PATCH 3/3] Updated data prefetch in TRSM, ASUM, DOT functions

Signed-off-by: kaustubh
---
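Note (placed below the ---, so git-am drops it): patch 1/3 rewrote the
pipelined DOT loops with hard-coded += / -= accumulations, which only match
the non-conjugated (DOTU) build; the hunks below switch those statements
back to the OP1/OP2 macros so the CONJ (DOTC) build negates the correct
terms again. As a scalar sketch of the convention -- assuming OP1/OP2 are
defined near the top of cdot_msa.c/zdot_msa.c roughly as below (their
definitions are an assumption here and are not part of this diff) -- the
per-element accumulation is:

    #if !defined(CONJ)
    #define OP1 -=   /* real part: acc_r -= xi * yi   ->   x * y       */
    #define OP2 +=   /* imag part: acc_i += xi * yr                    */
    #else
    #define OP1 +=   /* real part: acc_r += xi * yi   ->   conj(x) * y */
    #define OP2 -=   /* imag part: acc_i -= xi * yr                    */
    #endif

    /* Scalar reference for one complex element; dot0/dot1 in the vector
     * kernels play the roles of acc_r/acc_i per lane. */
    static void dot_acc(float xr, float xi, float yr, float yi,
                        float *acc_r, float *acc_i)
    {
        *acc_r += xr * yr;    /* always added         */
        *acc_r OP1 xi * yi;   /* sign depends on CONJ */
        *acc_i OP2 xi * yr;   /* sign depends on CONJ */
        *acc_i += xr * yi;    /* always added         */
    }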
 kernel/mips/cdot_msa.c | 32 ++++++++++-----------
 kernel/mips/zdot_msa.c | 64 +++++++++++++++++++++---------------------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c
index cb86e5fbd..0999fa08d 100644
--- a/kernel/mips/cdot_msa.c
+++ b/kernel/mips/cdot_msa.c
@@ -115,25 +115,25 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot0 += (vx0r * vy0r);
                 vx5 = LD_SP(x); x += 4;
                 vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2);
-                dot1 += (vx0i * vy0r);
+                dot1 OP2 (vx0i * vy0r);
                 vy4 = LD_SP(y); y += 4;
                 vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2);
                 dot2 += (vx1r * vy1r);
                 vy5 = LD_SP(y); y += 4;
                 vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2);
-                dot3 += (vx1i * vy1r);
+                dot3 OP2 (vx1i * vy1r);
                 vx6 = LD_SP(x); x += 4;
                 vx7 = LD_SP(x); x += 4;
                 vy6 = LD_SP(y); y += 4;
                 vy7 = LD_SP(y); y += 4;
                 vx8 = LD_SP(x); x += 4;
-                dot0 -= (vx0i * vy0i);
+                dot0 OP1 (vx0i * vy0i);
                 vx9 = LD_SP(x); x += 4;
                 vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4);
                 dot1 += (vx0r * vy0i);
                 vy8 = LD_SP(y); y += 4;
                 vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4);
-                dot2 -= (vx1i * vy1i);
+                dot2 OP1 (vx1i * vy1i);
                 vy9 = LD_SP(y); y += 4;
                 vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4);
                 dot3 += (vx1r * vy1i);
@@ -144,23 +144,23 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot4 += (vx2r * vy2r);
                 vy10 = LD_SP(y); y += 4;
                 vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6);
-                dot5 += (vx2i * vy2r);
+                dot5 OP2 (vx2i * vy2r);
                 vy11 = LD_SP(y); y += 4;
                 vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6);
                 vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6);
                 dot6 += (vx3r * vy3r);
                 vx0r = (v4f32) __msa_pckev_w((v4i32) vx9, (v4i32) vx8);
-                dot7 += (vx3i * vy3r);
+                dot7 OP2 (vx3i * vy3r);
                 vx0i = (v4f32) __msa_pckod_w((v4i32) vx9, (v4i32) vx8);
                 vy0r = (v4f32) __msa_pckev_w((v4i32) vy9, (v4i32) vy8);
                 vx2 = vx10;
                 vy0i = (v4f32) __msa_pckod_w((v4i32) vy9, (v4i32) vy8);
                 vx3 = vx11;
-                dot4 -= (vx2i * vy2i);
+                dot4 OP1 (vx2i * vy2i);
                 vy2 = vy10;
                 dot5 += (vx2r * vy2i);
                 vy3 = vy11;
-                dot6 -= (vx3i * vy3i);
+                dot6 OP1 (vx3i * vy3i);
                 dot7 += (vx3r * vy3i);
             }
@@ -169,36 +169,36 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot0 += (vx0r * vy0r);
             vx5 = LD_SP(x); x += 4;
             vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2);
-            dot1 += (vx0i * vy0r);
+            dot1 OP2 (vx0i * vy0r);
             vy4 = LD_SP(y); y += 4;
             vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2);
             dot2 += (vx1r * vy1r);
             vy5 = LD_SP(y); y += 4;
             vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2);
-            dot3 += (vx1i * vy1r);
+            dot3 OP2 (vx1i * vy1r);
             vx6 = LD_SP(x); x += 4;
             vx7 = LD_SP(x); x += 4;
             vy6 = LD_SP(y); y += 4;
             vy7 = LD_SP(y); y += 4;
-            dot0 -= (vx0i * vy0i);
+            dot0 OP1 (vx0i * vy0i);
             vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4);
             dot1 += (vx0r * vy0i);
             vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4);
-            dot2 -= (vx1i * vy1i);
+            dot2 OP1 (vx1i * vy1i);
             vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4);
             dot3 += (vx1r * vy1i);
             vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4);
             vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6);
             dot4 += (vx2r * vy2r);
             vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6);
-            dot5 += (vx2i * vy2r);
+            dot5 OP2 (vx2i * vy2r);
             vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6);
             vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6);
             dot6 += (vx3r * vy3r);
-            dot7 += (vx3i * vy3r);
-            dot4 -= (vx2i * vy2i);
+            dot7 OP2 (vx3i * vy3r);
+            dot4 OP1 (vx2i * vy2i);
             dot5 += (vx2r * vy2i);
-            dot6 -= (vx3i * vy3i);
+            dot6 OP1 (vx3i * vy3i);
             dot7 += (vx3r * vy3i);
         }
diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c
index 33012fcf1..0d9b3c4fa 100644
--- a/kernel/mips/zdot_msa.c
+++ b/kernel/mips/zdot_msa.c
@@ -118,25 +118,25 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot0 += (vx0r * vy0r);
                 vx5 = LD_DP(x); x += 2;
                 vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2);
-                dot1 += (vx0i * vy0r);
+                dot1 OP2 (vx0i * vy0r);
                 vy4 = LD_DP(y); y += 2;
                 vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2);
                 dot2 += (vx1r * vy1r);
                 vy5 = LD_DP(y); y += 2;
                 vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2);
-                dot3 += (vx1i * vy1r);
+                dot3 OP2 (vx1i * vy1r);
                 vx6 = LD_DP(x); x += 2;
                 vx7 = LD_DP(x); x += 2;
                 vy6 = LD_DP(y); y += 2;
                 vy7 = LD_DP(y); y += 2;
                 vx8 = LD_DP(x); x += 2;
-                dot0 -= (vx0i * vy0i);
+                dot0 OP1 (vx0i * vy0i);
                 vx9 = LD_DP(x); x += 2;
                 vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4);
                 dot1 += (vx0r * vy0i);
                 vy8 = LD_DP(y); y += 2;
                 vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4);
-                dot2 -= (vx1i * vy1i);
+                dot2 OP1 (vx1i * vy1i);
                 vy9 = LD_DP(y); y += 2;
                 vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4);
                 dot3 += (vx1r * vy1i);
@@ -147,23 +147,23 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot4 += (vx2r * vy2r);
                 vy10 = LD_DP(y); y += 2;
                 vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6);
-                dot5 += (vx2i * vy2r);
+                dot5 OP2 (vx2i * vy2r);
                 vy11 = LD_DP(y); y += 2;
                 vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6);
                 vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6);
                 dot6 += (vx3r * vy3r);
                 vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8);
-                dot7 += (vx3i * vy3r);
+                dot7 OP2 (vx3i * vy3r);
                 vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8);
                 vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8);
                 vx2 = vx10;
                 vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8);
                 vx3 = vx11;
-                dot4 -= (vx2i * vy2i);
+                dot4 OP1 (vx2i * vy2i);
                 vy2 = vy10;
                 dot5 += (vx2r * vy2i);
                 vy3 = vy11;
-                dot6 -= (vx3i * vy3i);
+                dot6 OP1 (vx3i * vy3i);
                 dot7 += (vx3r * vy3i);
             }
@@ -172,36 +172,36 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot0 += (vx0r * vy0r);
             vx5 = LD_DP(x); x += 2;
             vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2);
-            dot1 += (vx0i * vy0r);
+            dot1 OP2 (vx0i * vy0r);
             vy4 = LD_DP(y); y += 2;
             vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2);
             dot2 += (vx1r * vy1r);
             vy5 = LD_DP(y); y += 2;
             vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2);
-            dot3 += (vx1i * vy1r);
+            dot3 OP2 (vx1i * vy1r);
             vx6 = LD_DP(x); x += 2;
             vx7 = LD_DP(x); x += 2;
             vy6 = LD_DP(y); y += 2;
             vy7 = LD_DP(y); y += 2;
-            dot0 -= (vx0i * vy0i);
+            dot0 OP1 (vx0i * vy0i);
             vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4);
             dot1 += (vx0r * vy0i);
             vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4);
-            dot2 -= (vx1i * vy1i);
+            dot2 OP1 (vx1i * vy1i);
             vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4);
             dot3 += (vx1r * vy1i);
             vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4);
             vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6);
             dot4 += (vx2r * vy2r);
             vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6);
-            dot5 += (vx2i * vy2r);
+            dot5 OP2 (vx2i * vy2r);
             vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6);
             vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6);
             dot6 += (vx3r * vy3r);
-            dot7 += (vx3i * vy3r);
-            dot4 -= (vx2i * vy2i);
+            dot7 OP2 (vx3i * vy3r);
+            dot4 OP1 (vx2i * vy2i);
             dot5 += (vx2r * vy2i);
-            dot6 -= (vx3i * vy3i);
+            dot6 OP1 (vx3i * vy3i);
             dot7 += (vx3r * vy3i);
         }
     }
@@ -220,25 +220,25 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot0 += (vx0r * vy0r);
             vx5 = LD_DP(x); x += inc_x2;
             vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2);
-            dot1 += (vx0i * vy0r);
+            dot1 OP2 (vx0i * vy0r);
             vy4 = LD_DP(y); y += inc_y2;
             vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2);
             dot2 += (vx1r * vy1r);
             vy5 = LD_DP(y); y += inc_y2;
             vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2);
-            dot3 += (vx1i * vy1r);
+            dot3 OP2 (vx1i * vy1r);
             vx6 = LD_DP(x); x += inc_x2;
             vx7 = LD_DP(x); x += inc_x2;
             vy6 = LD_DP(y); y += inc_y2;
             vy7 = LD_DP(y); y += inc_y2;
             vx8 = LD_DP(x); x += inc_x2;
-            dot0 -= (vx0i * vy0i);
+            dot0 OP1 (vx0i * vy0i);
             vx9 = LD_DP(x); x += inc_x2;
             vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4);
             dot1 += (vx0r * vy0i);
             vy8 = LD_DP(y); y += inc_y2;
             vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4);
-            dot2 -= (vx1i * vy1i);
+            dot2 OP1 (vx1i * vy1i);
             vy9 = LD_DP(y); y += inc_y2;
             vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4);
             dot3 += (vx1r * vy1i);
@@ -249,23 +249,23 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot4 += (vx2r * vy2r);
             vy10 = LD_DP(y); y += inc_y2;
             vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6);
-            dot5 += (vx2i * vy2r);
+            dot5 OP2 (vx2i * vy2r);
             vy11 = LD_DP(y); y += inc_y2;
             vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6);
             vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6);
             dot6 += (vx3r * vy3r);
             vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8);
-            dot7 += (vx3i * vy3r);
+            dot7 OP2 (vx3i * vy3r);
             vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8);
             vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8);
             vx2 = vx10;
             vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8);
             vx3 = vx11;
-            dot4 -= (vx2i * vy2i);
+            dot4 OP1 (vx2i * vy2i);
             vy2 = vy10;
             dot5 += (vx2r * vy2i);
             vy3 = vy11;
-            dot6 -= (vx3i * vy3i);
+            dot6 OP1 (vx3i * vy3i);
             dot7 += (vx3r * vy3i);
         }
@@ -274,36 +274,36 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
         dot0 += (vx0r * vy0r);
         vx5 = LD_DP(x); x += inc_x2;
         vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2);
-        dot1 += (vx0i * vy0r);
+        dot1 OP2 (vx0i * vy0r);
        vy4 = LD_DP(y); y += inc_y2;
         vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2);
         dot2 += (vx1r * vy1r);
         vy5 = LD_DP(y); y += inc_y2;
         vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2);
-        dot3 += (vx1i * vy1r);
+        dot3 OP2 (vx1i * vy1r);
         vx6 = LD_DP(x); x += inc_x2;
         vx7 = LD_DP(x); x += inc_x2;
         vy6 = LD_DP(y); y += inc_y2;
         vy7 = LD_DP(y); y += inc_y2;
-        dot0 -= (vx0i * vy0i);
+        dot0 OP1 (vx0i * vy0i);
         vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4);
         dot1 += (vx0r * vy0i);
         vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4);
-        dot2 -= (vx1i * vy1i);
+        dot2 OP1 (vx1i * vy1i);
         vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4);
         dot3 += (vx1r * vy1i);
         vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4);
         vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6);
         dot4 += (vx2r * vy2r);
         vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6);
-        dot5 += (vx2i * vy2r);
+        dot5 OP2 (vx2i * vy2r);
         vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6);
         vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6);
         dot6 += (vx3r * vy3r);
-        dot7 += (vx3i * vy3r);
-        dot4 -= (vx2i * vy2i);
+        dot7 OP2 (vx3i * vy3r);
+        dot4 OP1 (vx2i * vy2i);
         dot5 += (vx2r * vy2i);
-        dot6 -= (vx3i * vy3i);
+        dot6 OP1 (vx3i * vy3i);
         dot7 += (vx3r * vy3i);
     }
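
Note on the loop structure shared by the rewritten DOT kernels above: the
loads and pckev/pckod de-interleaves for the next block are issued while the
current block's multiplies retire, the loop runs one trip short of the full
block count (the "(n >> 3) - 1" trip counts above), and the last block is
drained by a peeled copy of the body after the loop. A minimal scalar sketch
of that shape (hypothetical names; squaring stands in for the complex math):

    #include <stddef.h>

    static double pipelined_sum_sq(const double *p, size_t blocks)
    {
        double acc = 0.0, cur, nxt;
        size_t i;

        if (blocks == 0) return acc;

        cur = *p++;                   /* prologue: first load in flight */
        for (i = blocks - 1; i--;)    /* blocks - 1 trips, as above     */
        {
            nxt = *p++;               /* load for the next iteration    */
            acc += cur * cur;         /* math for the current one       */
            cur = nxt;
        }
        acc += cur * cur;             /* peeled epilogue drains it      */
        return acc;
    }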