diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index cd1fa45b3..4b3637c7c 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -1082,7 +1082,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, - FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif @@ -1092,18 +1092,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #if defined(TRMMKERNEL) BLASLONG off; #endif - FLOAT *pc0, *pc1, *pc2, *pc3; - FLOAT *pa0, *pb0; + FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT res0, res1, res2, res3, res4, res5, res6, res7; FLOAT res8, res9, res10, res11, res12, res13, res14, res15; - FLOAT a0_r, a1_r; - FLOAT a0_i, a1_i; + FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i; FLOAT b0_r, b1_r, b2_r, b3_r; - FLOAT b0_i, b1_i, b2_i, b3_i; v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; - v4f32 dst0, dst1, dst2, dst3; - v4f32 alpha_r, alpha_i; + v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i; v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; v4f32 dst0_r, dst0_i, dst1_r, dst1_i; @@ -1122,12 +1118,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, pc2 = pc1 + 2 * ldc; pc3 = pc2 + 2 * ldc; - pa0 = A; - #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -1150,6 +1146,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, temp = k; #endif +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 32(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X4_MSA(, -, , +, +); #endif @@ -1165,6 +1172,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, for (l = (temp - 1); l--;) { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 32(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X4_MSA(+, -, +, +,); #endif @@ -1340,6 +1358,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_2X4 #endif + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1357,11 +1379,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 2; // number of values in A #endif #endif - - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; } if (m & 1) @@ -1426,6 +1443,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_1X4 #endif + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1443,21 +1464,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif - l = k << 3; - B = B + l; - i = ldc << 3; - C = C + i; + B += (k << 3); + C += (ldc << 3); } if (n & 2) @@ -1465,12 +1479,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, pc0 = C; pc1 = pc0 + 2 * ldc; - pa0 = A; - #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -1691,6 +1705,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_2X2 #endif + pc0 += 4; + pc1 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1708,9 +1724,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 2; // number of values in A #endif #endif - - pc0 += 4; - pc1 += 4; } if (m & 1) @@ -1775,6 +1788,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_1X2 #endif + pc0 += 2; + pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1792,30 +1807,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif - l = k << 2; - B = B + l; - i = ldc << 2; - C = C + i; + B += (k << 2); + C += (ldc << 2); } if (n & 1) { pc0 = C; - pa0 = A; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -2036,6 +2047,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_2X1 #endif + pc0 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2053,8 +2065,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 2; // number of values in A #endif #endif - - pc0 += 4; } if (m & 1) @@ -2119,6 +2129,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else CGEMM_SCALE_1X1 #endif + pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2136,18 +2147,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif - l = k << 1; - B = B + l; - i = ldc << 1; - C = C + i; + B += (k << 1); + C += (ldc << 1); } return 0; diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c index 76368276d..40f700014 100644 --- a/kernel/mips/dgemm_kernel_8x4_msa.c +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -28,19 +28,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include "macros_msa.h" -#define ENABLE_PREFETCH - -#ifdef ENABLE_PREFETCH -inline static void prefetch_load_lf(unsigned char *src) { - __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); -} -#endif - static void __attribute__ ((noinline)) -dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, - FLOAT *C, BLASLONG ldc) +dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) { - BLASLONG i, j, l; + BLASLONG i, j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; @@ -48,6 +47,10 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT v2f64 res0, res1, res2, res3, res4, res5, res6, res7; v2f64 res8, res9, res10, res11, res12, res13, res14, res15; +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + for (j = (n >> 2); j--;) { pc0 = C; @@ -55,21 +58,47 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT pc2 = pc1 + ldc; pc3 = pc2 + ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + pa0 = A; for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif #ifdef ENABLE_PREFETCH - prefetch_load_lf((unsigned char *)(pa0 + 8)); - prefetch_load_lf((unsigned char *)(pa0 + 12)); - prefetch_load_lf((unsigned char *)(pa0 + 16)); - prefetch_load_lf((unsigned char *)(pa0 + 20)); + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 128(%[pa0]) \n\t" + "pref 0, 160(%[pa0]) \n\t" + "pref 0, 32(%[pb0]) \n\t" + "pref 0, 64(%[pb0]) \n\t" + "pref 0, 96(%[pb0]) \n\t" - prefetch_load_lf((unsigned char *)(pb0 + 4)); - prefetch_load_lf((unsigned char *)(pb0 + 8)); - prefetch_load_lf((unsigned char *)(pb0 + 12)); + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); @@ -99,13 +128,18 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res14 = src_a2 * src_b; res15 = src_a3 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { #ifdef ENABLE_PREFETCH - prefetch_load_lf((unsigned char *)(pa0 + 16)); - prefetch_load_lf((unsigned char *)(pa0 + 20)); - prefetch_load_lf((unsigned char *)(pa0 + 24)); - prefetch_load_lf((unsigned char *)(pa0 + 28)); + __asm__ __volatile__( + "pref 0, 128(%[pa0]) \n\t" + "pref 0, 160(%[pa0]) \n\t" + "pref 0, 192(%[pa0]) \n\t" + "pref 0, 224(%[pa0]) \n\t" + + : + : [pa0] "r" (pa0) + ); #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -136,8 +170,13 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); #ifdef ENABLE_PREFETCH - prefetch_load_lf((unsigned char *)(pb0 + 8)); - prefetch_load_lf((unsigned char *)(pb0 + 12)); + __asm__ __volatile__( + "pref 0, 64(%[pb0]) \n\t" + "pref 0, 96(%[pb0]) \n\t" + + : + : [pb0] "r" (pb0) + ); #endif LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -166,7 +205,7 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res15 += src_a3 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -197,11 +236,28 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT } #ifdef ENABLE_PREFETCH - prefetch_load_lf((unsigned char *)(pc0 + 8)); - prefetch_load_lf((unsigned char *)(pc1 + 8)); - prefetch_load_lf((unsigned char *)(pc2 + 8)); - prefetch_load_lf((unsigned char *)(pc3 + 8)); + __asm__ __volatile__( + "pref 0, 64(%[pc0]) \n\t" + "pref 0, 64(%[pc1]) \n\t" + "pref 0, 64(%[pc2]) \n\t" + "pref 0, 64(%[pc3]) \n\t" + + : + : [pc0] "r" (pc0), [pc1] "r" (pc1), + [pc2] "r" (pc2), [pc3] "r" (pc3) + ); #endif + +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -213,10 +269,20 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; - +#endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); +#if defined(TRMMKERNEL) + dst0 = res8 * v_alpha; + dst1 = res9 * v_alpha; + dst2 = res10 * v_alpha; + dst3 = res11 * v_alpha; + dst4 = res12 * v_alpha; + dst5 = res13 * v_alpha; + dst6 = res14 * v_alpha; + dst7 = res15 * v_alpha; +#else LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); @@ -228,23 +294,49 @@ dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; - +#endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } - l = (k << 2); - B = B + l; - i = (ldc << 2); - C = C + i; +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += (k << 2); + C += (ldc << 2); } } static void __attribute__ ((noinline)) -dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, - FLOAT *C, BLASLONG ldc) +dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) { - BLASLONG i, j, l; + BLASLONG j, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3; FLOAT a0, b0, b1, b2, b3; @@ -253,19 +345,93 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + for (j = (n >> 2); j--;) { +#if defined(TRMMKERNEL) + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pa0 = A; + +#if defined(LEFT) + off = offset; +#endif + + for (l = (m >> 3); l--;) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 4; // number of values in B +#endif + + pc0 += 8; + pc1 += 8; + pc2 += 8; + pc3 += 8; + pa0 += 8 * temp; + pb0 += 4 * temp; + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif + } +#else // #if !defined(TRMMKERNEL) pc0 = C + 8 * (m >> 3); pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pa0 = A + k * 8 * (m >> 3); +#endif if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -286,7 +452,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res6 = src_a0 * src_b; res7 = src_a1 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -327,7 +493,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res7 += src_a1 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); @@ -349,6 +515,16 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res7 += src_a1 * src_b; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); LD_DP2(pc2, 2, dst4, dst5); @@ -362,16 +538,51 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; - +#endif ST_DP2_INC(dst0, dst1, pc0, 2); ST_DP2_INC(dst2, dst3, pc1, 2); ST_DP2_INC(dst4, dst5, pc2, 2); ST_DP2_INC(dst6, dst7, pc3, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); pa0 += 2; @@ -389,7 +600,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 = src_a0 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); pa0 += 2; @@ -424,7 +635,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res3 += src_a0 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); pa0 += 2; @@ -443,6 +654,12 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT res3 += src_a0 * src_b; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst2 = LD_DP(pc2); @@ -452,7 +669,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); ST_DP(dst2, pc2); @@ -462,11 +679,46 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT pc1 += 2; pc2 += 2; pc3 += 2; + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 4; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 4; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -484,7 +736,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT pa0 += 1; pb0 += 4; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -519,7 +771,7 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT pb0 += 4; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -543,29 +795,61 @@ dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; + pc2[0] = tmp2; + pc3[0] = tmp3; +#else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; - +#endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } - l = (k << 2); - B = B + l; - i = (ldc << 2); - C = C + i; +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += (k << 2); + C += (ldc << 2); } } static void __attribute__ ((noinline)) -dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, - FLOAT *C, BLASLONG ldc) +dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, + FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) { - BLASLONG i, l; + BLASLONG i, l, temp; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif FLOAT *pc0, *pc1, *pa0, *pb0; FLOAT tmp0, tmp1; FLOAT a0, b0, b1; @@ -574,16 +858,42 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset + (4 * (n >> 2)); +#endif + if (n & 2) { pc0 = C; pc1 = pc0 + ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + pa0 = A; for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); @@ -601,7 +911,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res6 = src_a2 * src_b; res7 = src_a3 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); @@ -636,7 +946,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res7 += src_a3 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); @@ -655,6 +965,16 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res7 += src_a3 * src_b; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; + dst4 = res4 * v_alpha; + dst5 = res5 * v_alpha; + dst6 = res6 * v_alpha; + dst7 = res7 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); @@ -666,14 +986,49 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; - +#endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); @@ -687,7 +1042,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res2 = src_a0 * src_b; res3 = src_a1 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); @@ -714,7 +1069,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res3 += src_a1 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); @@ -729,6 +1084,12 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res3 += src_a1 * src_b; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); @@ -736,14 +1097,49 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_DP2_INC(dst0, dst1, pc0, 2); ST_DP2_INC(dst2, dst3, pc1, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); pa0 += 2; @@ -756,7 +1152,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 = src_a0 * src_b; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); pa0 += 2; @@ -781,7 +1177,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res1 += src_a0 * src_b; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); pa0 += 2; @@ -795,22 +1191,61 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL res1 += src_a0 * src_b; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); pc0 += 2; pc1 += 2; + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 2; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -822,7 +1257,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pa0 += 1; pb0 += 2; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -845,7 +1280,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 2; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -861,27 +1296,73 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; +#if defined(TRMMKERNEL) + pc0[0] = tmp0; + pc1[0] = tmp1; +#else pc0[0] += tmp0; pc1[0] += tmp1; - +#endif pc0 += 1; pc1 += 1; + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + pa0 += temp * 1; + pb0 += temp * 2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } - l = (k << 1); - B = B + l; - i = (ldc << 1); - C = C + i; +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += (k << 1); + C += (ldc << 1); } if (n & 1) { pc0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + pa0 = A; for (i = (m >> 3); i--;) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 8; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 8; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; @@ -894,7 +1375,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; @@ -919,7 +1400,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; @@ -933,19 +1414,60 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; + dst2 = res2 * v_alpha; + dst3 = res3 * v_alpha; +#else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; - +#endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 8; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 8; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 8; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 4) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 4; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 4; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; @@ -956,7 +1478,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; @@ -977,7 +1499,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; @@ -989,17 +1511,56 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; + dst1 = res1 * v_alpha; +#else LD_DP2(pc0, 2, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; - +#endif ST_DP2_INC(dst0, dst1, pc0, 2); + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 4; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 2) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 2; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 2; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1010,7 +1571,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pa0 += 2; pb0 += 1; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1031,7 +1592,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; @@ -1043,18 +1604,56 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } +#if defined(TRMMKERNEL) + dst0 = res0 * v_alpha; +#else dst0 = LD_DP(pc0); dst0 += res0 * v_alpha; - +#endif ST_DP(dst0, pc0); pc0 += 2; + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = k - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + pa0 += temp * 2; + pb0 += temp * 1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif +#endif // #if defined(TRMMKERNEL) } if (m & 1) { +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; +#else + pa0 += off * 1; + pb0 = B + off * 1; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = k - off; +#elif defined(LEFT) + temp = off + 1; // number of values in A +#else + temp = off + 1; // number of values in B +#endif +#else // #if !defined(TRMMKERNEL) + pb0 = B; + temp = k; +#endif a0 = pa0[0]; b0 = pb0[0]; @@ -1063,7 +1662,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pa0 += 1; pb0 += 1; - for (l = ((k - 1) >> 1); l--;) + for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; @@ -1080,7 +1679,7 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } - if ((k - 1) & 1) + if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; @@ -1090,15 +1689,12 @@ dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FL pb0 += 1; } +#if defined(TRMMKERNEL) + pc0[0] = alpha * tmp0; +#else pc0[0] += alpha * tmp0; - - pc0 += 1; +#endif } - - l = (k << 0); - B = B + l; - i = (ldc << 0); - C = C + i; } } @@ -1112,10 +1708,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, if (n >> 2) { if (m >> 3) +#ifdef TRMMKERNEL + dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset); +#else dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc); +#endif if (m & 7) +#ifdef TRMMKERNEL + dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset); +#else dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc); +#endif } if (n & 3) @@ -1123,7 +1727,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, B = B + (k << 2) * (n >> 2); C = C + (ldc << 2) * (n >> 2); +#ifdef TRMMKERNEL + dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc, offset); +#else dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc); +#endif } return 0; diff --git a/kernel/mips/sgemm_kernel_8x8_msa.c b/kernel/mips/sgemm_kernel_8x8_msa.c index 1695471ad..bfd02f95b 100644 --- a/kernel/mips/sgemm_kernel_8x8_msa.c +++ b/kernel/mips/sgemm_kernel_8x8_msa.c @@ -91,6 +91,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 = B; temp = k; #endif +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 32(%[pa0]) \n\t" + "pref 0, 32(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); @@ -129,6 +138,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, for (l = ((temp - 1) >> 1); l--;) { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 64(%[pb0]) \n\t" + "pref 0, 96(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif + LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); @@ -500,6 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, ST_SP(dst2, pc6); ST_SP(dst3, pc7); + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + pc4 += 4; + pc5 += 4; + pc6 += 4; + pc7 += 4; + #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; @@ -516,15 +546,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 4; // number of values in A #endif #endif - - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; - pc4 += 4; - pc5 += 4; - pc6 += 4; - pc7 += 4; } if (m & 2) @@ -763,6 +784,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc6[1] += tmp13; pc7[1] += tmp15; #endif + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + pc4 += 2; + pc5 += 2; + pc6 += 2; + pc7 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -780,15 +809,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 2; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; - pc4 += 2; - pc5 += 2; - pc6 += 2; - pc7 += 2; } if (m & 1) @@ -959,6 +979,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc6[0] += tmp6; pc7[0] += tmp7; #endif + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + pc4 += 1; + pc5 += 1; + pc6 += 1; + pc7 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -976,24 +1004,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 1; // number of values in A #endif #endif - pc0 += 1; - pc1 += 1; - pc2 += 1; - pc3 += 1; - pc4 += 1; - pc5 += 1; - pc6 += 1; - pc7 += 1; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 8; // number of values in A #endif - l = (k << 3); - B = B + l; - i = (ldc << 3); - C = C + i; + B += (k << 3); + C += (ldc << 3); } if (n & 4) @@ -1003,12 +1021,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc2 = pc1 + ldc; pc3 = pc2 + ldc; - pa0 = A; - #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -1145,7 +1163,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif - ST_SP2_INC(dst0, dst1, pc0, 4); ST_SP2_INC(dst2, dst3, pc1, 4); ST_SP2_INC(dst4, dst5, pc2, 4); @@ -1268,6 +1285,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pa0 += 4; pb0 += 4; } + #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; @@ -1289,6 +1307,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, ST_SP(dst2, pc2); ST_SP(dst3, pc3); + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; @@ -1305,10 +1328,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 4; // number of values in A #endif #endif - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; } if (m & 2) @@ -1459,6 +1478,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc2[1] += tmp5; pc3[1] += tmp7; #endif + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1476,11 +1499,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 2; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; } if (m & 1) @@ -1591,6 +1609,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc2[0] += tmp2; pc3[0] += tmp3; #endif + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1608,20 +1630,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 1; // number of values in A #endif #endif - pc0 += 1; - pc1 += 1; - pc2 += 1; - pc3 += 1; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif - l = (k << 2); - B = B + l; - i = (ldc << 2); - C = C + i; + B += (k << 2); + C += (ldc << 2); } if (n & 2) @@ -1629,12 +1645,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc0 = C; pc1 = pc0 + ldc; - pa0 = A; - #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -1847,6 +1863,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, ST_SP(dst0, pc0); ST_SP(dst1, pc1); + pc0 += 4; + pc1 += 4; + #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; @@ -1863,8 +1882,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 4; // number of values in A #endif #endif - pc0 += 4; - pc1 += 4; } if (m & 2) @@ -1967,6 +1984,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc0[1] += tmp1; pc1[1] += tmp3; #endif + pc0 += 2; + pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1984,9 +2003,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 2; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; } if (m & 1) @@ -2067,6 +2083,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc0[0] += tmp0; pc1[0] += tmp1; #endif + pc0 += 1; + pc1 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2084,28 +2102,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 1; // number of values in A #endif #endif - pc0 += 1; - pc1 += 1; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif - l = (k << 1); - B = B + l; - i = (ldc << 1); - C = C + i; + + B += (k << 1); + C += (ldc << 1); } if (n & 1) { pc0 = C; - pa0 = A; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif + pa0 = A; + for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) @@ -2272,6 +2288,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #endif ST_SP(dst0, pc0); + pc0 += 4; + #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; @@ -2288,7 +2306,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 4; // number of values in A #endif #endif - pc0 += 4; } if (m & 2) @@ -2359,6 +2376,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pb0 += 1; } + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + #if defined(TRMMKERNEL) pc0[0] = tmp0; pc0[1] = tmp1; @@ -2366,6 +2386,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, pc0[0] += tmp0; pc0[1] += tmp1; #endif + pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2383,8 +2404,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, off += 2; // number of values in A #endif #endif - - pc0 += 2; } if (m & 1) @@ -2448,34 +2467,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, #else pc0[0] += alpha * tmp0; #endif - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = k - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - pa0 += temp * 1; - pb0 += temp * 1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif -#endif - - pc0 += 1; } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; // number of values in A -#endif - l = (k << 0); - B = B + l; - i = (ldc << 0); - C = C + i; } return 0; diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c index a185c69dd..96e56a305 100644 --- a/kernel/mips/zgemm_kernel_4x4_msa.c +++ b/kernel/mips/zgemm_kernel_4x4_msa.c @@ -851,6 +851,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, temp = k; #endif +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 64(%[pb0]) \n\t" + "pref 0, 96(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X4_MSA(, -, , +, +); #endif @@ -866,6 +878,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, for (l = (temp - 1); l--;) { +#ifdef ENABLE_PREFETCH + __asm__ __volatile__( + "pref 0, 64(%[pa0]) \n\t" + "pref 0, 96(%[pa0]) \n\t" + "pref 0, 64(%[pb0]) \n\t" + "pref 0, 96(%[pb0]) \n\t" + + : + : [pa0] "r" (pa0), [pb0] "r" (pb0) + ); +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); #endif @@ -1039,6 +1063,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else ZGEMM_SCALE_1X4_MSA #endif + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1056,21 +1084,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif - l = k << 3; - B = B + l; - i = ldc << 3; - C = C + i; + B += (k << 3); + C += (ldc << 3); } if (n & 2) @@ -1294,6 +1315,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else ZGEMM_SCALE_1X2_MSA #endif + pc0 += 2; + pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1311,19 +1334,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; - pc1 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif - l = k << 2; - B = B + l; - i = ldc << 2; - C = C + i; + B += (k << 2); + C += (ldc << 2); } if (n & 1) @@ -1555,6 +1573,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, #else ZGEMM_SCALE_1X1 #endif + pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1572,18 +1591,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, off += 1; // number of values in A #endif #endif - - pc0 += 2; } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif - l = k << 1; - B = B + l; - i = ldc << 1; - C = C + i; + B += (k << 1); + C += (ldc << 1); } + return 0; }