From 00abce3b9379712d75ebb9223d8247f48533d2b0 Mon Sep 17 00:00:00 2001 From: kaustubh Date: Tue, 22 Nov 2016 11:21:03 +0530 Subject: [PATCH] Add data prefetch in DOT and ASUM functions Signed-off-by: kaustubh --- kernel/mips/casum_msa.c | 267 +++++++---------------- kernel/mips/cdot_msa.c | 471 ++++++++++++++++++---------------------- kernel/mips/dasum_msa.c | 203 +++++++---------- kernel/mips/ddot_msa.c | 128 ++++------- kernel/mips/sasum_msa.c | 372 +++++++++++++------------------ kernel/mips/sdot_msa.c | 136 ++++-------- kernel/mips/zasum_msa.c | 288 ++++++++++++++---------- kernel/mips/zdot_msa.c | 333 +++++++++++++++------------- 8 files changed, 961 insertions(+), 1237 deletions(-) diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c index 454573d56..5bb948392 100644 --- a/kernel/mips/casum_msa.c +++ b/kernel/mips/casum_msa.c @@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i, inc_x2; FLOAT sumf = 0.0; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; - v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; - v4f32 zero_v = {0}; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + v4f32 sum_abs0 = {0, 0, 0, 0}; + v4f32 sum_abs1 = {0, 0, 0, 0}; + v4f32 sum_abs2 = {0, 0, 0, 0}; + v4f32 sum_abs3 = {0, 0, 0, 0}; v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { - if (n > 15) +#ifdef ENABLE_PREFETCH + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) { - n -= 16; + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + x_pref = x + pref_offset + 128; +#endif + + for (i = (n >> 5); i--;) + { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 0(%[x_pref])\n\t" + "pref 0, 32(%[x_pref])\n\t" + "pref 0, 64(%[x_pref])\n\t" + "pref 0, 96(%[x_pref])\n\t" + "pref 0, 128(%[x_pref])\n\t" + "pref 0, 160(%[x_pref])\n\t" + "pref 0, 192(%[x_pref])\n\t" + "pref 0, 224(%[x_pref])\n\t" + + : : [x_pref] "r" (x_pref) + ); + + x_pref += 64; +#endif LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); - - sum_abs0 = AND_VEC_W(src0); - sum_abs1 = AND_VEC_W(src1); - sum_abs2 = AND_VEC_W(src2); - sum_abs3 = AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - sum_abs2 += AND_VEC_W(src6); - sum_abs3 += AND_VEC_W(src7); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - sum_abs2 = zero_v; - sum_abs3 = zero_v; - } - - for (i = (n >> 4); i--;) - { - LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); } - if (n & 15) + if (n & 31) { - if ((n & 8) && (n & 4) && (n & 2)) + if (n & 16) { - LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -94,65 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs0 += AND_VEC_W(src4); sum_abs1 += 
AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; + sum_abs3 += AND_VEC_W(src7); } - else if ((n & 8) && (n & 4)) - { - LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if ((n & 8) && (n & 2)) - { - LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if ((n & 4) && (n & 2)) - { - LD_SP3_INC(x, 4, src0, src1, src2); - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if (n & 8) + if (n & 8) { LD_SP4_INC(x, 4, src0, src1, src2, src3); @@ -160,97 +124,45 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } - else if (n & 4) + + if (n & 4) { LD_SP2_INC(x, 4, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } - else if (n & 2) + + if (n & 2) { src0 = LD_SP(x); x += 4; sum_abs0 += AND_VEC_W(src0); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else - { - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } if (n & 1) { - sumf += fabsf(*(x + 0)); + sumf += fabsf(*x); sumf += fabsf(*(x + 1)); } } - else - { - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - sumf = sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; } else { inc_x2 = 2 * inc_x; - if (n > 8) - { - n -= 8; - - LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); - - sum_abs0 = AND_VEC_W(src0); - sum_abs1 = AND_VEC_W(src1); - sum_abs2 = AND_VEC_W(src2); - sum_abs3 = AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - sum_abs2 += AND_VEC_W(src6); - sum_abs3 += AND_VEC_W(src7); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - sum_abs2 = zero_v; - sum_abs3 = zero_v; - } - - for (i = (n >> 3); i--;) + for (i = (n >> 4); i--;) { LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += 
AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -260,13 +172,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); } - if (n & 7) + if (n & 15) { - if ((n & 4) && (n & 2) && (n & 1)) + if (n & 8) { - LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6); + LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -275,37 +195,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); + sum_abs3 += AND_VEC_W(src7); } - else if ((n & 4) && (n & 2)) - { - LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5); - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - } - else if ((n & 4) && (n & 1)) - { - LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4); - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - } - else if ((n & 2) && (n & 1)) - { - LD_SP3_INC(x, inc_x2, src0, src1, src2); - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - } - else if (n & 4) + if (n & 4) { LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); @@ -314,22 +207,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); } - else if (n & 2) + + if (n & 2) { LD_SP2_INC(x, inc_x2, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); } - else if (n & 1) + + if (n & 1) { - src0 = LD_SP(x); x += inc_x2; + src0 = LD_SP(x); sum_abs0 += AND_VEC_W(src0); } } - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0] + sum_abs0[1]; } diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c index bf9f6b7e2..2079c9e76 100644 --- a/kernel/mips/cdot_msa.c +++ b/kernel/mips/cdot_msa.c @@ -29,333 +29,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "macros_msa.h" #if !defined(CONJ) - #define OP2 += - #define OP3 - - #define OP4 + + #define OP1 -= + #define OP2 += + #define OP3 - + #define OP4 + #else - #define OP2 -= - #define OP3 + - #define OP4 - + #define OP1 += + #define OP2 -= + #define OP3 + + #define OP4 - #endif -#define DOT16_KERNEL(OPR0, OPR1) \ - dot0 += (vx0r * vy0r); \ - dot0 OPR0## = (vx0i * vy0i); \ - dot1 OPR1## = (vx0i * vy0r); \ - dot1 += (vx0r * vy0i); \ - \ - dot0 += (vx1r * vy1r); \ - dot0 OPR0## = (vx1i * vy1i); \ - dot1 OPR1## = (vx1i * vy1r); \ - dot1 += (vx1r * vy1i); \ - \ - dot0 += (vx2r * vy2r); \ - dot0 OPR0## = (vx2i * vy2i); \ - dot1 OPR1## = (vx2i * vy2r); \ - dot1 += (vx2r * vy2i); \ - \ - dot0 += (vx3r * vy3r); \ - dot0 OPR0## = (vx3i * vy3i); \ - dot1 OPR1## = (vx3i * vy3r); \ - dot1 += (vx3r * vy3i); - -#define DOT12_KERNEL(OPR0, OPR1) \ - dot0 += (vx0r * vy0r); \ - dot0 OPR0## = (vx0i * vy0i); \ - dot1 OPR1## = (vx0i * vy0r); \ - dot1 += (vx0r * vy0i); \ - \ - dot0 += (vx1r * vy1r); \ - dot0 OPR0## = (vx1i * vy1i); \ - dot1 OPR1## = (vx1i * vy1r); \ - dot1 += (vx1r * vy1i); \ - \ - dot0 += (vx2r * vy2r); \ - dot0 OPR0## = (vx2i * vy2i); \ - dot1 OPR1## = (vx2i * vy2r); \ - dot1 += (vx2r * vy2i); - -#define DOT8_KERNEL(OPR0, OPR1) \ - dot0 += (vx0r * vy0r); \ - dot0 OPR0## = (vx0i * vy0i); \ - dot1 OPR1## = (vx0i * vy0r); \ - dot1 += (vx0r * vy0i); \ - \ - dot0 += (vx1r * vy1r); \ - dot0 OPR0## = (vx1i * vy1i); \ - dot1 OPR1## = (vx1i * vy1r); \ - dot1 += (vx1r * vy1i); - -#define DOT4_KERNEL(OPR0, OPR1) \ - dot0 += (vx0r * vy0r); \ - dot0 OPR0## = (vx0i * vy0i); \ - dot1 OPR1## = (vx0i * vy0r); \ - dot1 += (vx0r * vy0i); - -/* return float, x,y float */ -/* cdotc - CONJ */ -/* cdotu - !CONJ */ -#ifndef _MSC_VER -#include -FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -#else OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -#endif { BLASLONG i = 0; FLOAT dot[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; + BLASLONG inc_x2, inc_y2; FLOAT x0, x1, x2, x3, x4, x5, x6, x7; FLOAT y0, y1, y2, y3, y4, y5, y6, y7; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; - v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; - v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; + v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; + v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; - openblas_complex_float result; + v4f32 dot2 = {0, 0, 0, 0}; + v4f32 dot3 = {0, 0, 0, 0}; + v4f32 dot4 = {0, 0, 0, 0}; + v4f32 dot5 = {0, 0, 0, 0}; + v4f32 dot6 = {0, 0, 0, 0}; + v4f32 dot7 = {0, 0, 0, 0}; + OPENBLAS_COMPLEX_FLOAT result; dot[0] = 0.0; dot[1] = 0.0; - __real__(result) = 0.0; - __imag__(result) = 0.0; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; - if ( n < 1 ) return(result); + if (n < 1) return (result); if ((1 == inc_x) && (1 == inc_y)) { +#ifdef ENABLE_PREFETCH + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + x_pref = x + pref_offset + 64; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + y_pref = y + pref_offset + 64; +#endif + for (i = (n >> 4); i--;) { - LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - 
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 0(%[x_pref])\n\t" + "pref 0, 32(%[x_pref])\n\t" + "pref 0, 64(%[x_pref])\n\t" + "pref 0, 96(%[x_pref])\n\t" + "pref 0, 0(%[y_pref])\n\t" + "pref 0, 32(%[y_pref])\n\t" + "pref 0, 64(%[y_pref])\n\t" + "pref 0, 96(%[y_pref])\n\t" - PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); - PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); - PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); - PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); + : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref) + ); - PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); - PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); - PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); - PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); + x_pref += 32; + y_pref += 32; +#endif - #if !defined(CONJ) - DOT16_KERNEL(-, +); - #else - DOT16_KERNEL(+, -); - #endif + LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); + PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); + PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); + + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); + PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); + PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); + + dot0 += (vx0r * vy0r); + dot0 OP1 (vx0i * vy0i); + dot1 OP2 (vx0i * vy0r); + dot1 += (vx0r * vy0i); + + dot2 += (vx1r * vy1r); + dot2 OP1 (vx1i * vy1i); + dot3 OP2 (vx1i * vy1r); + dot3 += (vx1r * vy1i); + + dot4 += (vx2r * vy2r); + dot4 OP1 (vx2i * vy2i); + dot5 OP2 (vx2i * vy2r); + dot5 += (vx2r * vy2i); + + dot6 += (vx3r * vy3r); + dot6 OP1 (vx3i * vy3i); + dot7 OP2 (vx3i * vy3r); + dot7 += (vx3r * vy3i); } if (n & 15) { - if ((n & 8) && (n & 4)) + if (n & 8) { - LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); - LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); - LD_SP2_INC(x, 4, vx4, vx5); - LD_SP2_INC(y, 4, vy4, vy5); + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); - PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); - PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); - PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); - PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); - PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); - PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); + PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); - #if !defined(CONJ) - DOT12_KERNEL(-, +); - #else - DOT12_KERNEL(+, -); - #endif - } - else if (n & 8) - { - LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); - LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); + dot0 += (vx0r * vy0r); + dot0 OP1 (vx0i * vy0i); + dot1 OP2 (vx0i * vy0r); + dot1 += (vx0r * vy0i); - PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); - PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); - - PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); - PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); - - #if !defined(CONJ) - DOT8_KERNEL(-, +); - #else - DOT8_KERNEL(+, -); - #endif - } - else if (n & 4) - { - LD_SP2_INC(x, 4, vx0, vx1); - LD_SP2_INC(y, 4, vy0, vy1); - PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); - PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); - - #if !defined(CONJ) - DOT4_KERNEL(-, +); - #else - DOT4_KERNEL(+, -); - #endif + dot2 += (vx1r * vy1r); + dot2 OP1 (vx1i * vy1i); + dot3 OP2 (vx1i * vy1r); + dot3 += (vx1r * vy1i); } - if ((n & 2) && (n & 1)) - { - LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5); - LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5); + if (n & 4) + { + LD_SP2_INC(x, 4, vx0, vx1); + LD_SP2_INC(y, 4, vy0, vy1); + PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); + PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] 
OP2 ( x1 * y0 OP4 x0 * y1 ); + dot0 += (vx0r * vy0r); + dot0 OP1 (vx0i * vy0i); + dot1 OP2 (vx0i * vy0r); + dot1 += (vx0r * vy0i); + } - dot[0] += ( x2 * y2 OP3 x3 * y3 ); - dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); - - dot[0] += ( x4 * y4 OP3 x5 * y5 ); - dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); - } - else if (n & 2) - { + if (n & 2) + { LD_GP4_INC(x, 1, x0, x1, x2, x3); LD_GP4_INC(y, 1, y0, y1, y2, y3); - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + dot[0] += (x0 * y0 OP3 x1 * y1); + dot[1] OP2 (x1 * y0 OP4 x0 * y1); - dot[0] += ( x2 * y2 OP3 x3 * y3 ); - dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); - } - else if (n & 1) - { + dot[0] += (x2 * y2 OP3 x3 * y3); + dot[1] OP2 (x3 * y2 OP4 x2 * y3); + } + + if (n & 1) + { LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); - } + dot[0] += (x0 * y0 OP3 x1 * y1); + dot[1] OP2 (x1 * y0 OP4 x0 * y1); + } } - dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); - dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); - } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + dot0 += dot2 + dot4 + dot6; + dot1 += dot3 + dot5 + dot7; - for (i = (n >> 2); i--;) - { - x0 = *x; - x1 = *(x + 1); - x += inc_x2; - x2 = *x; - x3 = *(x + 1); - x += inc_x2; - x4 = *x; - x5 = *(x + 1); - x += inc_x2; - x6 = *x; - x7 = *(x + 1); - x += inc_x2; + dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); + dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); + } + else + { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - y0 = *y; - y1 = *(y + 1); - y += inc_y2; - y2 = *y; - y3 = *(y + 1); - y += inc_y2; - y4 = *y; - y5 = *(y + 1); - y += inc_y2; - y6 = *y; - y7 = *(y + 1); - y += inc_y2; + for (i = (n >> 2); i--;) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + x2 = *x; + x3 = *(x + 1); + x += inc_x2; + x4 = *x; + x5 = *(x + 1); + x += inc_x2; + x6 = *x; + x7 = *(x + 1); + x += inc_x2; - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + y2 = *y; + y3 = *(y + 1); + y += inc_y2; + y4 = *y; + y5 = *(y + 1); + y += inc_y2; + y6 = *y; + y7 = *(y + 1); + y += inc_y2; - dot[0] += ( x2 * y2 OP3 x3 * y3 ); - dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + dot[0] += (x0 * y0 OP3 x1 * y1); + dot[1] OP2 (x1 * y0 OP4 x0 * y1); - dot[0] += ( x4 * y4 OP3 x5 * y5 ); - dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); + dot[0] += (x2 * y2 OP3 x3 * y3); + dot[1] OP2 (x3 * y2 OP4 x2 * y3); - dot[0] += ( x6 * y6 OP3 x7 * y7 ); - dot[1] OP2 ( x7 * y6 OP4 x6 * y7 ); - } + dot[0] += (x4 * y4 OP3 x5 * y5); + dot[1] OP2 (x5 * y4 OP4 x4 * y5); - if ((n & 2) && (n & 1)) - { - x0 = *x; - x1 = *(x + 1); - x += inc_x2; - x2 = *x; - x3 = *(x + 1); - x += inc_x2; - x4 = *x; - x5 = *(x + 1); - x += inc_x2; + dot[0] += (x6 * y6 OP3 x7 * y7); + dot[1] OP2 (x7 * y6 OP4 x6 * y7); + } - y0 = *y; - y1 = *(y + 1); - y += inc_y2; - y2 = *y; - y3 = *(y + 1); - y += inc_y2; - y4 = *y; - y5 = *(y + 1); - y += inc_y2; + if (n & 2) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; + x2 = *x; + x3 = *(x + 1); + x += inc_x2; - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + y0 = *y; + y1 = *(y + 1); + y += inc_y2; + y2 = *y; + y3 = *(y + 1); + y += inc_y2; - dot[0] += ( x2 * y2 OP3 x3 * y3 ); - dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); + dot[0] += (x0 * y0 OP3 x1 * y1); + dot[1] OP2 (x1 * y0 OP4 x0 * y1); - dot[0] += ( x4 * y4 OP3 x5 * y5 ); - dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); - } - else if (n & 2) - { - x0 = *x; - x1 = *(x + 1); - x += inc_x2; - x2 = *x; - x3 = *(x + 1); - x 
+= inc_x2; + dot[0] += (x2 * y2 OP3 x3 * y3); + dot[1] OP2 (x3 * y2 OP4 x2 * y3); + } - y0 = *y; - y1 = *(y + 1); - y += inc_y2; - y2 = *y; - y3 = *(y + 1); - y += inc_y2; + if (n & 1) + { + x0 = *x; + x1 = *(x + 1); + x += inc_x2; - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); + y0 = *y; + y1 = *(y + 1); + y += inc_y2; - dot[0] += ( x2 * y2 OP3 x3 * y3 ); - dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); - } - else if (n & 1) - { - x0 = *x; - x1 = *(x + 1); - x += inc_x2; + dot[0] += (x0 * y0 OP3 x1 * y1); + dot[1] OP2 (x1 * y0 OP4 x0 * y1); + } + } - y0 = *y; - y1 = *(y + 1); - y += inc_y2; + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; - dot[0] += ( x0 * y0 OP3 x1 * y1 ); - dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); - } - } - - __real__(result) = dot[0]; - __imag__(result) = dot[1]; - - return(result); + return (result); } diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c index a3641cd50..1128d63eb 100644 --- a/kernel/mips/dasum_msa.c +++ b/kernel/mips/dasum_msa.c @@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i; FLOAT sumf = 0.0; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; - v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; - v2f64 zero_v = {0}; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + v2f64 sum_abs0 = {0, 0}; + v2f64 sum_abs1 = {0, 0}; + v2f64 sum_abs2 = {0, 0}; + v2f64 sum_abs3 = {0, 0}; v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { - if (n > 15) +#ifdef ENABLE_PREFETCH + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) { - n -= 16; + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + x_pref = x + pref_offset + 64; +#endif + + for (i = (n >> 5); i--;) + { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 0(%[x_pref])\n\t" + "pref 0, 32(%[x_pref])\n\t" + "pref 0, 64(%[x_pref])\n\t" + "pref 0, 96(%[x_pref])\n\t" + "pref 0, 128(%[x_pref])\n\t" + "pref 0, 160(%[x_pref])\n\t" + "pref 0, 192(%[x_pref])\n\t" + "pref 0, 224(%[x_pref])\n\t" + + : : [x_pref] "r" (x_pref) + ); + + x_pref += 32; +#endif LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); - - sum_abs0 = AND_VEC_D(src0); - sum_abs1 = AND_VEC_D(src1); - sum_abs2 = AND_VEC_D(src2); - sum_abs3 = AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - sum_abs1 += AND_VEC_D(src5); - sum_abs2 += AND_VEC_D(src6); - sum_abs3 += AND_VEC_D(src7); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - sum_abs2 = zero_v; - sum_abs3 = zero_v; - } - - for (i = (n >> 4); i--;) - { - LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); @@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); } - if (n & 15) + if (n & 31) { - if ((n & 8) && (n & 4) && (n & 2)) + if (n & 16) { - LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); @@ 
-94,37 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); } - else if ((n & 8) && (n & 4)) - { - LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - sum_abs3 += AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - sum_abs1 += AND_VEC_D(src5); - } - else if ((n & 8) && (n & 2)) - { - LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); - - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - sum_abs3 += AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - } - else if ((n & 4) && (n & 2)) - { - LD_DP3_INC(x, 2, src0, src1, src2); - - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - } - else if (n & 8) + if (n & 8) { LD_DP4_INC(x, 2, src0, src1, src2, src3); @@ -133,64 +125,38 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); } - else if (n & 4) + + if (n & 4) { LD_DP2_INC(x, 2, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } - else if (n & 2) + + if (n & 2) { src0 = LD_DP(x); x += 2; sum_abs0 += AND_VEC_D(src0); } - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf = sum_abs0[0] + sum_abs0[1]; - if (n & 1) { sumf += fabs(*x); } } - else - { - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - sumf = sum_abs0[0] + sum_abs0[1]; - } + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; + + sumf += sum_abs0[0] + sum_abs0[1]; } else { - if (n > 8) - { - n -= 8; - - LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); - - sum_abs0 = AND_VEC_D(src0); - sum_abs1 = AND_VEC_D(src1); - sum_abs2 = AND_VEC_D(src2); - sum_abs3 = AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - sum_abs1 += AND_VEC_D(src5); - sum_abs2 += AND_VEC_D(src6); - sum_abs3 += AND_VEC_D(src7); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - sum_abs2 = zero_v; - sum_abs3 = zero_v; - } - - for (i = (n >> 3); i--;) + for (i = (n >> 4); i--;) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); + LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); @@ -200,13 +166,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); } - if (n & 7) + if (n & 15) { - if ((n & 4) && (n & 2) && (n & 1)) + if (n & 8) { - LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); + LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); @@ -215,37 +189,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); } - else if ((n & 4) && (n & 2)) - { - LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - sum_abs3 += AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - sum_abs1 += AND_VEC_D(src5); - } - else if ((n & 4) && (n & 
1)) - { - LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); - - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - sum_abs3 += AND_VEC_D(src3); - sum_abs0 += AND_VEC_D(src4); - } - else if ((n & 2) && (n & 1)) - { - LD_DP3_INC(x, inc_x, src0, src1, src2); - - sum_abs0 += AND_VEC_D(src0); - sum_abs1 += AND_VEC_D(src1); - sum_abs2 += AND_VEC_D(src2); - } - else if (n & 4) + if (n & 4) { LD_DP4_INC(x, inc_x, src0, src1, src2, src3); @@ -254,14 +201,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); } - else if (n & 2) + + if (n & 2) { LD_DP2_INC(x, inc_x, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } - else if (n & 1) + + if (n & 1) { src0 = LD_DP(x); @@ -269,7 +218,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0]; } diff --git a/kernel/mips/ddot_msa.c b/kernel/mips/ddot_msa.c index b56e10135..b92f3132a 100644 --- a/kernel/mips/ddot_msa.c +++ b/kernel/mips/ddot_msa.c @@ -28,105 +28,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include "macros_msa.h" -/* return float, x,y float */ -#if defined(DSDOT) -double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -#else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -#endif { BLASLONG i = 0; - double dot = 0.0; + FLOAT dot = 0.0; FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; - if (n < 0) return (dot); + if (n < 1) return (dot); if ((1 == inc_x) && (1 == inc_y)) { for (i = (n >> 4); i--;) { - LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 256(%[x])\n\t" + "pref 0, 288(%[x])\n\t" + "pref 0, 320(%[x])\n\t" + "pref 0, 352(%[x])\n\t" + "pref 0, 256(%[y])\n\t" + "pref 0, 288(%[y])\n\t" + "pref 0, 320(%[y])\n\t" + "pref 0, 352(%[y])\n\t" + + : : [x] "r" (x), [y] "r" (y) + ); +#endif dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); + dot1 += (vy1 * vx1); + dot2 += (vy2 * vx2); + dot3 += (vy3 * vx3); dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); - dot0 += (vy6 * vx6); - dot0 += (vy7 * vx7); + dot1 += (vy5 * vx5); + dot2 += (vy6 * vx6); + dot3 += (vy7 * vx7); } if (n & 15) { - if ((n & 8) && (n & 4) && (n & 2)) - { - LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); - LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); - dot0 += (vy6 * vx6); - } - else if ((n & 8) && (n & 4)) - { - LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); - LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); - } - else if ((n & 8) && (n & 2)) - { - LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); - LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += 
(vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - } - else if ((n & 4) && (n & 2)) - { - LD_DP3_INC(x, 2, vx0, vx1, vx2); - LD_DP3_INC(y, 2, vy0, vy1, vy2); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - } - else if (n & 8) + if (n & 8) { - LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); - LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); + LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); + LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); + dot1 += (vy1 * vx1); + dot2 += (vy2 * vx2); + dot3 += (vy3 * vx3); } - else if (n & 4) + + if (n & 4) { - LD_DP2_INC(x, 2, vx0, vx1); - LD_DP2_INC(y, 2, vy0, vy1); + LD_DP2_INC(x, 2, vx0, vx1); + LD_DP2_INC(y, 2, vy0, vy1); dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); + dot1 += (vy1 * vx1); } - else if (n & 2) + + if (n & 2) { vx0 = LD_DP(x); x += 2; vy0 = LD_DP(y); y += 2; @@ -143,6 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } + dot0 += dot1 + dot2 + dot3; + dot += dot0[0]; dot += dot0[1]; } @@ -159,16 +131,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += (y3 * x3); } - if ((n & 2) && (n & 1)) - { - LD_GP3_INC(x, inc_x, x0, x1, x2); - LD_GP3_INC(y, inc_y, y0, y1, y2); - - dot += (y0 * x0); - dot += (y1 * x1); - dot += (y2 * x2); - } - else if (n & 2) + if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); @@ -176,7 +139,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += (y0 * x0); dot += (y1 * x1); } - else if (n & 1) + + if (n & 1) { x0 = *x; y0 = *y; diff --git a/kernel/mips/sasum_msa.c b/kernel/mips/sasum_msa.c index e968f8307..e15332f85 100644 --- a/kernel/mips/sasum_msa.c +++ b/kernel/mips/sasum_msa.c @@ -34,42 +34,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - FLOAT data0, data1, data2, sumf = 0.0; + FLOAT data0, data1, sumf = 0.0; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; - v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; - v4f32 zero_v = {0}; + v4f32 src8, src9, src10, src11, src12, src13, src14, src15; + v4f32 sum_abs0 = {0, 0, 0, 0}; + v4f32 sum_abs1 = {0, 0, 0, 0}; + v4f32 sum_abs2 = {0, 0, 0, 0}; + v4f32 sum_abs3 = {0, 0, 0, 0}; + v4f32 zero_v = {0, 0, 0, 0}; v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { - if (n > 31) +#ifdef ENABLE_PREFETCH + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) { - n -= 32; + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + x_pref = x + pref_offset + 128; +#endif + + for (i = 0; i < (n >> 6); i++) + { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 0(%[x_pref])\n\t" + "pref 0, 32(%[x_pref])\n\t" + "pref 0, 64(%[x_pref])\n\t" + "pref 0, 96(%[x_pref])\n\t" + "pref 0, 128(%[x_pref])\n\t" + "pref 0, 160(%[x_pref])\n\t" + "pref 0, 192(%[x_pref])\n\t" + "pref 0, 224(%[x_pref])\n\t" + + : : [x_pref] "r" (x_pref) + ); + + x_pref += 64; +#endif LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); - - sum_abs0 = AND_VEC_W(src0); - sum_abs1 = AND_VEC_W(src1); - sum_abs2 = AND_VEC_W(src2); - sum_abs3 = AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - sum_abs2 += AND_VEC_W(src6); - sum_abs3 += AND_VEC_W(src7); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - sum_abs2 = zero_v; - sum_abs3 = zero_v; - } - - for (i = 0; i < (n >> 5); i++) - { - LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -79,13 +91,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); + sum_abs0 += AND_VEC_W(src8); + sum_abs1 += AND_VEC_W(src9); + sum_abs2 += AND_VEC_W(src10); + sum_abs3 += AND_VEC_W(src11); + sum_abs0 += AND_VEC_W(src12); + sum_abs1 += AND_VEC_W(src13); + sum_abs2 += AND_VEC_W(src14); + sum_abs3 += AND_VEC_W(src15); } - if (n & 31) + if (n & 63) { - if ((n & 16) && (n & 8) && (n & 4)) + if (n & 32) { - LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); + LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); @@ -94,65 +114,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; + sum_abs3 += AND_VEC_W(src7); } - else if ((n & 16) && (n & 8)) - { - LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - sum_abs1 += AND_VEC_W(src5); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if ((n & 16) && (n & 4)) - { - LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); - - sum_abs0 += AND_VEC_W(src0); - 
sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - sum_abs3 += AND_VEC_W(src3); - sum_abs0 += AND_VEC_W(src4); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if ((n & 8) && (n & 4)) - { - LD_SP3_INC(x, 4, src0, src1, src2); - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src1); - sum_abs2 += AND_VEC_W(src2); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else if (n & 16) + if (n & 16) { LD_SP4_INC(x, 4, src0, src1, src2, src3); @@ -160,173 +125,146 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } - else if (n & 8) + + if (n & 8) { LD_SP2_INC(x, 4, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } - else if (n & 4) + + if (n & 4) { src0 = LD_SP(x); x += 4; sum_abs0 += AND_VEC_W(src0); - - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - else - { - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; } if (n & 2) { - sumf += fabsf(*(x + 0)); + sumf += fabsf(*x); sumf += fabsf(*(x + 1)); x += 2; } if (n & 1) { - sumf += fabsf(*(x + 0)); + sumf += fabsf(*x); } } - else - { - sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; - sumf += sum_abs0[0]; - sumf += sum_abs0[1]; - sumf += sum_abs0[2]; - sumf += sum_abs0[3]; - } - } - else - { - if (n > 8) - { - n -= 8; - - src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); - x += inc_x; - - sum_abs0 = AND_VEC_W(src0); - sum_abs1 = AND_VEC_W(src4); - } - else - { - sum_abs0 = zero_v; - sum_abs1 = zero_v; - } - - for (i = (n >> 3); i--;) - { - src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); - x += inc_x; - src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); - x += inc_x; - - sum_abs0 += AND_VEC_W(src0); - sum_abs1 += AND_VEC_W(src4); - } - - if (n & 4) - { - src0 = 
(v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); - x += inc_x; - src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); - x += inc_x; - - sum_abs0 += AND_VEC_W(src0); - } - - sum_abs0 += sum_abs1; + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf += sum_abs0[0]; sumf += sum_abs0[1]; sumf += sum_abs0[2]; sumf += sum_abs0[3]; - - if ((n & 2) && (n & 1)) + } + else + { + for (i = (n >> 4); i--;) { - data0 = fabsf(*x); x += inc_x; - data1 = fabsf(*x); x += inc_x; - data2 = fabsf(*x); + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); + x += inc_x; + src2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src2 = (v4f32) __msa_insert_w((v4i32) src2, 1, *((int *) x)); + x += inc_x; + src2 = (v4f32) __msa_insert_w((v4i32) src2, 2, *((int *) x)); + x += inc_x; + src2 = (v4f32) __msa_insert_w((v4i32) src2, 3, *((int *) x)); + x += inc_x; + src3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src3 = (v4f32) __msa_insert_w((v4i32) src3, 1, *((int *) x)); + x += inc_x; + src3 = (v4f32) __msa_insert_w((v4i32) src3, 2, *((int *) x)); + x += inc_x; + src3 = (v4f32) __msa_insert_w((v4i32) src3, 3, *((int *) x)); + x += inc_x; - sumf += data0; - sumf += data1; - sumf += data2; + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + sum_abs2 += AND_VEC_W(src2); + sum_abs3 += AND_VEC_W(src3); } - else if (n & 2) + + if (n & 15) { - data0 = fabsf(*x); x += inc_x; - data1 = fabsf(*x); + if (n & 8) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); + x += inc_x; + src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); + x += inc_x; - sumf += data0; - sumf += data1; - } - else if (n & 1) - { - data0 = fabsf(*x); + sum_abs0 += AND_VEC_W(src0); + sum_abs1 += AND_VEC_W(src1); + } - sumf += data0; + if (n & 4) + { + src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); + x += inc_x; + src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); + x += inc_x; + + sum_abs0 += AND_VEC_W(src0); + } + + if (n & 2) + { + data0 = fabsf(*x); x += inc_x; + data1 = fabsf(*x); x += inc_x; + + sumf += data0; + sumf += data1; + } + + if (n & 1) + { + sumf += fabsf(*x); + } } + + sum_abs0 += sum_abs1 + 
sum_abs2 + sum_abs3; + + sumf += sum_abs0[0]; + sumf += sum_abs0[1]; + sumf += sum_abs0[2]; + sumf += sum_abs0[3]; } return (sumf); diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index 1997ec5a0..f281db349 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include "macros_msa.h" -/* return float, x,y float */ #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else @@ -37,96 +36,71 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i = 0; double dot = 0.0; - float x0, x1, x2, x3, y0, y1, y2, y3; + FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; v4f32 dot0 = {0, 0, 0, 0}; + v4f32 dot1 = {0, 0, 0, 0}; + v4f32 dot2 = {0, 0, 0, 0}; + v4f32 dot3 = {0, 0, 0, 0}; - if (n < 0) return (dot); + if (n < 1) return (dot); if ((1 == inc_x) && (1 == inc_y)) { for (i = (n >> 5); i--;) { - LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); - LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); + LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); + +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 256(%[x])\n\t" + "pref 0, 288(%[x])\n\t" + "pref 0, 320(%[x])\n\t" + "pref 0, 352(%[x])\n\t" + "pref 0, 256(%[y])\n\t" + "pref 0, 288(%[y])\n\t" + "pref 0, 320(%[y])\n\t" + "pref 0, 352(%[y])\n\t" + + : : [x] "r" (x), [y] "r" (y) + ); +#endif dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); + dot1 += (vy1 * vx1); + dot2 += (vy2 * vx2); + dot3 += (vy3 * vx3); dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); - dot0 += (vy6 * vx6); - dot0 += (vy7 * vx7); + dot1 += (vy5 * vx5); + dot2 += (vy6 * vx6); + dot3 += (vy7 * vx7); } if (n & 31) { - if ((n & 16) && (n & 8) && (n & 4)) + if (n & 16) { - LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6); - LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6); + LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); + LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); - dot0 += (vy6 * vx6); + dot1 += (vy1 * vx1); + dot2 += (vy2 * vx2); + dot3 += (vy3 * vx3); } - else if ((n & 16) && (n & 8)) + + if (n & 8) { - LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5); - LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5); + LD_SP2_INC(x, 4, vx0, vx1); + LD_SP2_INC(y, 4, vy0, vy1); dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - dot0 += (vy5 * vx5); + dot1 += (vy1 * vx1); } - else if ((n & 16) && (n & 4)) - { - LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4); - LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4); - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - dot0 += (vy4 * vx4); - } - else if ((n & 8) && (n & 4)) - { - LD_SP3_INC(x, 4, vx0, vx1, vx2); - LD_SP3_INC(y, 4, vy0, vy1, vy2); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - } - else if (n & 16) - { - LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); - LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); - - dot0 += (vy0 * vx0); - dot0 += (vy1 * vx1); - dot0 += (vy2 * vx2); - dot0 += (vy3 * vx3); - } - else if (n & 8) - { - LD_SP2_INC(x, 4, vx0, vx1); - LD_SP2_INC(y, 4, vy0, vy1); - - dot0 += (vy0 * vx0); - dot0 += 
(vy1 * vx1); - } - else if (n & 4) + if (n & 4) { vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; @@ -134,16 +108,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot0 += (vy0 * vx0); } - if ((n & 2) && (n & 1)) - { - LD_GP3_INC(x, 1, x0, x1, x2); - LD_GP3_INC(y, 1, y0, y1, y2); - - dot += (y0 * x0); - dot += (y1 * x1); - dot += (y2 * x2); - } - else if (n & 2) + if (n & 2) { LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); @@ -151,7 +116,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += (y0 * x0); dot += (y1 * x1); } - else if (n & 1) + + if (n & 1) { x0 = *x; y0 = *y; @@ -160,6 +126,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } + dot0 += dot1 + dot2 + dot3; + dot += dot0[0]; dot += dot0[1]; dot += dot0[2]; @@ -178,16 +146,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += (y3 * x3); } - if ((n & 2) && (n & 1)) - { - LD_GP3_INC(x, inc_x, x0, x1, x2); - LD_GP3_INC(y, inc_y, y0, y1, y2); - - dot += (y0 * x0); - dot += (y1 * x1); - dot += (y2 * x2); - } - else if (n & 2) + if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); @@ -195,7 +154,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += (y0 * x0); dot += (y1 * x1); } - else if (n & 1) + + if (n & 1) { x0 = *x; y0 = *y; diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c index c84d48ecb..8c4f8d175 100644 --- a/kernel/mips/zasum_msa.c +++ b/kernel/mips/zasum_msa.c @@ -31,139 +31,191 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) -#define PROCESS_ZD(inc_val) \ - if (n > 8) \ - { \ - n -= 8; \ - \ - LD_DP8_INC(x, inc_val, src0, src1, src2, \ - src3, src4, src5, src6, src7); \ - \ - sum_abs0 = AND_VEC_D(src0); \ - sum_abs1 = AND_VEC_D(src1); \ - sum_abs2 = AND_VEC_D(src2); \ - sum_abs3 = AND_VEC_D(src3); \ - sum_abs0 += AND_VEC_D(src4); \ - sum_abs1 += AND_VEC_D(src5); \ - sum_abs2 += AND_VEC_D(src6); \ - sum_abs3 += AND_VEC_D(src7); \ - } \ - else \ - { \ - sum_abs0 = zero_v; \ - sum_abs1 = zero_v; \ - sum_abs2 = zero_v; \ - sum_abs3 = zero_v; \ - } \ - \ - for (i = (n >> 3); i--;) \ - { \ - LD_DP8_INC(x, inc_val, src0, src1, src2, \ - src3, src4, src5, src6, src7); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - sum_abs3 += AND_VEC_D(src3); \ - sum_abs0 += AND_VEC_D(src4); \ - sum_abs1 += AND_VEC_D(src5); \ - sum_abs2 += AND_VEC_D(src6); \ - sum_abs3 += AND_VEC_D(src7); \ - } \ - \ - if (n & 7) \ - { \ - if ((n & 4) && (n & 2) && (n & 1)) \ - { \ - LD_DP7_INC(x, inc_val, src0, src1, src2, \ - src3, src4, src5, src6); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - sum_abs3 += AND_VEC_D(src3); \ - sum_abs0 += AND_VEC_D(src4); \ - sum_abs1 += AND_VEC_D(src5); \ - sum_abs2 += AND_VEC_D(src6); \ - } \ - else if ((n & 4) && (n & 2)) \ - { \ - LD_DP6_INC(x, inc_val, src0, src1, src2, \ - src3, src4, src5); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - sum_abs3 += AND_VEC_D(src3); \ - sum_abs0 += AND_VEC_D(src4); \ - sum_abs1 += AND_VEC_D(src5); \ - } \ - else if ((n & 4) && (n & 1)) \ - { \ - LD_DP5_INC(x, inc_val, src0, src1, src2, \ - src3, src4); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - sum_abs3 += 
AND_VEC_D(src3); \ - sum_abs0 += AND_VEC_D(src4); \ - } \ - else if ((n & 2) && (n & 1)) \ - { \ - LD_DP3_INC(x, inc_val, src0, src1, src2); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - } \ - else if (n & 4) \ - { \ - LD_DP4_INC(x, inc_val, src0, src1, src2, \ - src3); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - sum_abs2 += AND_VEC_D(src2); \ - sum_abs3 += AND_VEC_D(src3); \ - } \ - else if (n & 2) \ - { \ - LD_DP2_INC(x, inc_val, src0, src1); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - sum_abs1 += AND_VEC_D(src1); \ - } \ - else if (n & 1) \ - { \ - src0 = LD_DP(x); \ - \ - sum_abs0 += AND_VEC_D(src0); \ - } \ - } \ - \ - sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \ - sumf = sum_abs0[0] + sum_abs0[1]; - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i; FLOAT sumf = 0.0; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; - v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; - v2f64 zero_v = {0}; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + v2f64 sum_abs0 = {0, 0}; + v2f64 sum_abs1 = {0, 0}; + v2f64 sum_abs2 = {0, 0}; + v2f64 sum_abs3 = {0, 0}; v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { - PROCESS_ZD(2); +#ifdef ENABLE_PREFETCH + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + } + pref_offset = pref_offset / sizeof(FLOAT); + x_pref = x + pref_offset + 64; +#endif + + for (i = (n >> 4); i--;) + { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 0(%[x_pref])\n\t" + "pref 0, 32(%[x_pref])\n\t" + "pref 0, 64(%[x_pref])\n\t" + "pref 0, 96(%[x_pref])\n\t" + "pref 0, 128(%[x_pref])\n\t" + "pref 0, 160(%[x_pref])\n\t" + "pref 0, 192(%[x_pref])\n\t" + "pref 0, 224(%[x_pref])\n\t" + + : : [x_pref] "r" (x_pref) + ); + + x_pref += 32; +#endif + + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + sum_abs0 += AND_VEC_D(src8); + sum_abs1 += AND_VEC_D(src9); + sum_abs2 += AND_VEC_D(src10); + sum_abs3 += AND_VEC_D(src11); + sum_abs0 += AND_VEC_D(src12); + sum_abs1 += AND_VEC_D(src13); + sum_abs2 += AND_VEC_D(src14); + sum_abs3 += AND_VEC_D(src15); + } + + if (n & 15) + { + if (n & 8) + { + LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + sum_abs0 += AND_VEC_D(src4); + sum_abs1 += AND_VEC_D(src5); + sum_abs2 += AND_VEC_D(src6); + sum_abs3 += AND_VEC_D(src7); + } + + if (n & 4) + { + LD_DP4_INC(x, 2, src0, src1, src2, src3); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + sum_abs2 += AND_VEC_D(src2); + sum_abs3 += AND_VEC_D(src3); + } + + if (n & 2) + { + LD_DP2_INC(x, 2, src0, src1); + + sum_abs0 += AND_VEC_D(src0); + sum_abs1 += AND_VEC_D(src1); + } + + if (n & 1) + { + src0 = LD_DP(x); + + sum_abs0 += AND_VEC_D(src0); + } + } + + sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; + sumf = sum_abs0[0] + sum_abs0[1]; } else { inc_x *= 2; - PROCESS_ZD(inc_x); + + for (i = (n >> 4); i--;) + { + 
             LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
+
+            sum_abs0 += AND_VEC_D(src0);
+            sum_abs1 += AND_VEC_D(src1);
+            sum_abs2 += AND_VEC_D(src2);
+            sum_abs3 += AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+            sum_abs0 += AND_VEC_D(src8);
+            sum_abs1 += AND_VEC_D(src9);
+            sum_abs2 += AND_VEC_D(src10);
+            sum_abs3 += AND_VEC_D(src11);
+            sum_abs0 += AND_VEC_D(src12);
+            sum_abs1 += AND_VEC_D(src13);
+            sum_abs2 += AND_VEC_D(src14);
+            sum_abs3 += AND_VEC_D(src15);
+        }
+
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+                sum_abs2 += AND_VEC_D(src6);
+                sum_abs3 += AND_VEC_D(src7);
+            }
+
+            if (n & 4)
+            {
+                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+            }
+
+            if (n & 2)
+            {
+                LD_DP2_INC(x, inc_x, src0, src1);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+            }
+
+            if (n & 1)
+            {
+                src0 = LD_DP(x);
+
+                sum_abs0 += AND_VEC_D(src0);
+            }
+        }
+
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+        sumf = sum_abs0[0] + sum_abs0[1];
     }
 
     return (sumf);
diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c
index 482c0cf05..f3c1847b4 100644
--- a/kernel/mips/zdot_msa.c
+++ b/kernel/mips/zdot_msa.c
@@ -29,195 +29,220 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "macros_msa.h"
 
 #if !defined(CONJ)
-    #define OP2     +=
-    #define OP3     -
-    #define OP4     +
+    #define OP1     -=
+    #define OP2     +=
+    #define OP3     -
+    #define OP4     +
 #else
-    #define OP2     -=
-    #define OP3     +
-    #define OP4     -
+    #define OP1     +=
+    #define OP2     -=
+    #define OP3     +
+    #define OP4     -
 #endif
 
-#define DOT16_KERNEL(OPR0, OPR1)    \
-    dot0 += (vx0r * vy0r);          \
-    dot0 OPR0## = (vx0i * vy0i);    \
-    dot1 OPR1## = (vx0i * vy0r);    \
-    dot1 += (vx0r * vy0i);          \
-                                    \
-    dot0 += (vx1r * vy1r);          \
-    dot0 OPR0## = (vx1i * vy1i);    \
-    dot1 OPR1## = (vx1i * vy1r);    \
-    dot1 += (vx1r * vy1i);          \
-                                    \
-    dot0 += (vx2r * vy2r);          \
-    dot0 OPR0## = (vx2i * vy2i);    \
-    dot1 OPR1## = (vx2i * vy2r);    \
-    dot1 += (vx2r * vy2i);          \
-                                    \
-    dot0 += (vx3r * vy3r);          \
-    dot0 OPR0## = (vx3i * vy3i);    \
-    dot1 OPR1## = (vx3i * vy3r);    \
-    dot1 += (vx3r * vy3i);
-#define DOT12_KERNEL(OPR0, OPR1)    \
-    dot0 += (vx0r * vy0r);          \
-    dot0 OPR0## = (vx0i * vy0i);    \
-    dot1 OPR1## = (vx0i * vy0r);    \
-    dot1 += (vx0r * vy0i);          \
-                                    \
-    dot0 += (vx1r * vy1r);          \
-    dot0 OPR0## = (vx1i * vy1i);    \
-    dot1 OPR1## = (vx1i * vy1r);    \
-    dot1 += (vx1r * vy1i);          \
-                                    \
-    dot0 += (vx2r * vy2r);          \
-    dot0 OPR0## = (vx2i * vy2i);    \
-    dot1 OPR1## = (vx2i * vy2r);    \
-    dot1 += (vx2r * vy2i);
-#define DOT8_KERNEL(OPR0, OPR1)     \
-    dot0 += (vx0r * vy0r);          \
-    dot0 OPR0## = (vx0i * vy0i);    \
-    dot1 OPR1## = (vx0i * vy0r);    \
-    dot1 += (vx0r * vy0i);          \
-                                    \
-    dot0 += (vx1r * vy1r);          \
-    dot0 OPR0## = (vx1i * vy1i);    \
-    dot1 OPR1## = (vx1i * vy1r);    \
-    dot1 += (vx1r * vy1i);
-#define DOT4_KERNEL(OPR0, OPR1)     \
-    dot0 += (vx0r * vy0r);          \
-    dot0 OPR0## = (vx0i * vy0i);    \
-    dot1 OPR1## = (vx0i * vy0r);    \
-    dot1 += (vx0r * vy0i);
-/* return double, x,y double */
-/* zdotc -  CONJ */
-/* zdotu - !CONJ */
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
     BLASLONG i = 0;
     FLOAT dot[2];
-    BLASLONG inc_x2;
-    BLASLONG inc_y2;
+    BLASLONG inc_x2, inc_y2;
     v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
     v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
-    v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
-    v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
     v2f64 dot0 = {0, 0};
     v2f64 dot1 = {0, 0};
+    v2f64 dot2 = {0, 0};
+    v2f64 dot3 = {0, 0};
+    v2f64 dot4 = {0, 0};
+    v2f64 dot5 = {0, 0};
+    v2f64 dot6 = {0, 0};
+    v2f64 dot7 = {0, 0};
     v2f64 zero = {0, 0};
-    openblas_complex_double result;
+    OPENBLAS_COMPLEX_FLOAT result;
 
     dot[0] = 0.0;
     dot[1] = 0.0;
 
-    __real__(result) = 0.0;
-    __imag__(result) = 0.0;
+    CREAL(result) = 0.0;
+    CIMAG(result) = 0.0;
 
-    if ( n < 1 ) return(result);
+    if (n < 1) return (result);
 
     inc_x2 = 2 * inc_x;
     inc_y2 = 2 * inc_y;
 
-    for (i = (n >> 3); i--;)
-    {
-        LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
-        LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
-        PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-        PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-        PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
-        PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+#ifdef ENABLE_PREFETCH
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        double *x_pref, *y_pref;
+        BLASLONG pref_offset;
 
-        PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-        PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-        PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
-        PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(double);
+        x_pref = x + pref_offset + 32;
 
-        #if !defined(CONJ)
-            DOT16_KERNEL(-, +);
-        #else
-            DOT16_KERNEL(+, -);
-        #endif
-    }
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(double);
+        y_pref = y + pref_offset + 32;
 
-    if (n & 7)
-    {
-        if ((n & 4) && (n & 2))
-        {
-            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
-            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
-            LD_DP2_INC(x, inc_x2, vx4, vx5);
-            LD_DP2_INC(y, inc_y2, vy4, vy5);
+        for (i = (n >> 3); i--;)
+        {
+            __asm__ __volatile__(
+                "pref 0, 0(%[x_pref])\n\t"
+                "pref 0, 32(%[x_pref])\n\t"
+                "pref 0, 64(%[x_pref])\n\t"
+                "pref 0, 96(%[x_pref])\n\t"
+                "pref 0, 0(%[y_pref])\n\t"
+                "pref 0, 32(%[y_pref])\n\t"
+                "pref 0, 64(%[y_pref])\n\t"
+                "pref 0, 96(%[y_pref])\n\t"
 
-            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-            PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+                : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref)
+            );
 
-            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-            PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+            x_pref += 16;
+            y_pref += 16;
 
-            #if !defined(CONJ)
-                DOT12_KERNEL(-, +);
-            #else
-                DOT12_KERNEL(+, -);
-            #endif
-        }
-        else if (n & 4)
-        {
-            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
-            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+            LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
 
-            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+            PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
 
-            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+            PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
 
-            #if !defined(CONJ)
-                DOT8_KERNEL(-, +);
-            #else
-                DOT8_KERNEL(+, -);
-            #endif
-        }
-        else if (n & 2)
-        {
-            LD_DP2_INC(x, inc_x2, vx0, vx1);
-            LD_DP2_INC(y, inc_y2, vy0, vy1);
-            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
 
-            #if !defined(CONJ)
-                DOT4_KERNEL(-, +);
-            #else
-                DOT4_KERNEL(+, -);
-            #endif
-        }
+            dot2 += (vx1r * vy1r);
+            dot2 OP1 (vx1i * vy1i);
+            dot3 OP2 (vx1i * vy1r);
+            dot3 += (vx1r * vy1i);
 
-        if (n & 1)
-        {
-            vx0 = LD_DP(x);
-            vy0 = LD_DP(y);
-            PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
-            PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+            dot4 += (vx2r * vy2r);
+            dot4 OP1 (vx2i * vy2i);
+            dot5 OP2 (vx2i * vy2r);
+            dot5 += (vx2r * vy2i);

-            #if !defined(CONJ)
-                DOT4_KERNEL(-, +);
-            #else
-                DOT4_KERNEL(+, -);
-            #endif
-        }
-    }
+            dot6 += (vx3r * vy3r);
+            dot6 OP1 (vx3i * vy3i);
+            dot7 OP2 (vx3i * vy3r);
+            dot7 += (vx3r * vy3i);
+        }
+    }
+    else
+#endif
+    for (i = (n >> 3); i--;)
+    {
+        LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+        LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
 
-    dot[0] += (dot0[0] + dot0[1]);
-    dot[1] += (dot1[0] + dot1[1]);
+        PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+        PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+        PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+        PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
 
-    __real__(result) = dot[0];
-    __imag__(result) = dot[1];
+        PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+        PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+        PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+        PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
 
-    return(result);
+        dot0 += (vx0r * vy0r);
+        dot0 OP1 (vx0i * vy0i);
+        dot1 OP2 (vx0i * vy0r);
+        dot1 += (vx0r * vy0i);
+
+        dot2 += (vx1r * vy1r);
+        dot2 OP1 (vx1i * vy1i);
+        dot3 OP2 (vx1i * vy1r);
+        dot3 += (vx1r * vy1i);
+
+        dot4 += (vx2r * vy2r);
+        dot4 OP1 (vx2i * vy2i);
+        dot5 OP2 (vx2i * vy2r);
+        dot5 += (vx2r * vy2i);
+
+        dot6 += (vx3r * vy3r);
+        dot6 OP1 (vx3i * vy3i);
+        dot7 OP2 (vx3i * vy3r);
+        dot7 += (vx3r * vy3i);
+    }
+
+    if (n & 7)
+    {
+        if (n & 4)
+        {
+            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+
+            dot2 += (vx1r * vy1r);
+            dot2 OP1 (vx1i * vy1i);
+            dot3 OP2 (vx1i * vy1r);
+            dot3 += (vx1r * vy1i);
+        }
+
+        if (n & 2)
+        {
+            LD_DP2_INC(x, inc_x2, vx0, vx1);
+            LD_DP2_INC(y, inc_y2, vy0, vy1);
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+        }
+
+        if (n & 1)
+        {
+            vx0 = LD_DP(x);
+            vy0 = LD_DP(y);
+            PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+        }
+    }
+
+    dot0 += dot2 + dot4 + dot6;
+    dot1 += dot3 + dot5 + dot7;
+
+    dot[0] += (dot0[0] + dot0[1]);
+    dot[1] += (dot1[0] + dot1[1]);
+
+    CREAL(result) = dot[0];
+    CIMAG(result) = dot[1];
+
+    return (result);
 }
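
Note on the prefetch setup used in these kernels (explanatory only, not part of the patch): each contiguous-stride path first rounds the source pointer up to the next L1 cache-line boundary, converts that byte offset into elements, and then keeps a prefetch pointer a fixed distance ahead of the load stream (for zdot_msa.c, 32 doubles ahead), bumping it by the number of elements consumed per unrolled iteration (x_pref += 16 for eight v2f64 loads). The sketch below is a minimal illustration of that pointer computation; it assumes L1_DATA_LINESIZE is a power of two, and the helper name compute_pref_pointer is hypothetical, introduced here purely for illustration.

    #include <stddef.h>
    #include <stdint.h>

    #ifndef L1_DATA_LINESIZE
    #define L1_DATA_LINESIZE 64   /* assumption for this sketch: 64-byte L1 cache lines */
    #endif

    /* Hypothetical helper mirroring the x_pref/pref_offset computation above:
     * align 'x' up to the next cache-line boundary, then step 'ahead' elements
     * further so that prefetches stay in front of the loads.                   */
    static double *compute_pref_pointer(double *x, ptrdiff_t ahead)
    {
        ptrdiff_t pref_offset = (uintptr_t)x & (L1_DATA_LINESIZE - 1);  /* bytes past the line start */

        if (pref_offset > 0)
            pref_offset = L1_DATA_LINESIZE - pref_offset;               /* bytes up to the next line */

        pref_offset /= (ptrdiff_t)sizeof(double);                       /* convert bytes to elements */

        return x + pref_offset + ahead;                                 /* prefetch this far ahead   */
    }

In the unrolled loops, the MIPS pref instruction is then issued for the next few cache lines starting at this pointer (byte offsets 0, 32, 64, 96 in zdot_msa.c above), which is what the inline __asm__ blocks do.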