diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
index dc21dab45..0169ff814 100644
--- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
+++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c
@@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
     v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@@ -44,6 +45,26 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     FLOAT *c_nxt2line = c + 2 * ldc;
     FLOAT *c_nxt3line = c + 3 * ldc;
 
+#ifdef ENABLE_PREFETCH
+    __asm__ __volatile__(
+        "pref 0, -96(%[a]) \n\t"
+        "pref 0, -32(%[a]) \n\t"
+        "pref 0, -160(%[a]) \n\t"
+        "pref 0, -224(%[a]) \n\t"
+        "pref 0, -64(%[a]) \n\t"
+        "pref 0, -128(%[a]) \n\t"
+        "pref 0, -192(%[a]) \n\t"
+        "pref 0, -256(%[a]) \n\t"
+        "pref 0, -320(%[a]) \n\t"
+        "pref 0, -384(%[a]) \n\t"
+        "pref 0, -448(%[a]) \n\t"
+        "pref 0, -512(%[a]) \n\t"
+
+        :
+        : [a] "r"(a)
+    );
+#endif
+
     LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
     LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
     LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
@@ -55,16 +76,25 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         FLOAT *pba = a, *pbb = b;
         v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
 
-        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
-        LD_DP2(pbb, 2, src_b0, src_b1);
+        LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2_INC(pbb, 2, src_b0, src_b1);
 
-        for (i = (bk - 1); i--;)
+        for (i = (bk - 1) >> 1; i--;)
         {
-            pba += 8;
-            pbb += 4;
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 128(%[pba]) \n\t"
+                "pref 0, 160(%[pba]) \n\t"
+                "pref 0, 192(%[pba]) \n\t"
+                "pref 0, 224(%[pba]) \n\t"
 
-            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
-            LD_DP2(pbb, 2, src_b2, src_b3);
+                :
+                : [pba] "r"(pba)
+            );
+#endif
+
+            LD_DP4_INC(pba, 2, src_a8, src_a9, src_a16, src_a17);
+            LD_DP2_INC(pbb, 2, src_b2, src_b3);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             src_c0 -= src_a0 * src_b;
@@ -90,12 +120,62 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
             src_c14 -= src_a2 * src_b;
             src_c15 -= src_a3 * src_b;
 
-            src_a0 = src_a8;
-            src_a1 = src_a9;
-            src_a2 = src_a16;
-            src_a3 = src_a17;
-            src_b0 = src_b2;
-            src_b1 = src_b3;
+            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(pbb, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c0 -= src_a8 * src_b;
+            src_c1 -= src_a9 * src_b;
+            src_c2 -= src_a16 * src_b;
+            src_c3 -= src_a17 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c4 -= src_a8 * src_b;
+            src_c5 -= src_a9 * src_b;
+            src_c6 -= src_a16 * src_b;
+            src_c7 -= src_a17 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c8 -= src_a8 * src_b;
+            src_c9 -= src_a9 * src_b;
+            src_c10 -= src_a16 * src_b;
+            src_c11 -= src_a17 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c12 -= src_a8 * src_b;
+            src_c13 -= src_a9 * src_b;
+            src_c14 -= src_a16 * src_b;
+            src_c15 -= src_a17 * src_b;
+        }
+
+        if ((bk - 1) & 1)
+        {
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(pbb, 2, src_b0, src_b1);
         }
 
         src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@@ -1180,7 +1260,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             bb = b + 4 * kk;
             cc = c + (m - 1);
 
-            dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
+            dsolve_1x4_ln_msa(aa, bb, cc, ldc, (k - kk));
 
             kk -= 1;
         }
@@ -1191,7 +1271,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             bb = b + 4 * kk;
             cc = c + ((m & -2) - 2);
 
-            dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
+            dsolve_2x4_ln_msa(aa, bb, cc, ldc, (k - kk));
 
             kk -= 2;
         }
@@ -1202,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             bb = b + 4 * kk;
             cc = c + ((m & -4) - 4);
 
-            dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
+            dsolve_4x4_ln_msa(aa, bb, cc, ldc, (k - kk));
 
             kk -= 4;
         }
@@ -1216,7 +1296,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
         do
         {
-            dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
+            dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
 
             aa -= 8 * k;
             cc -= 8;
@@ -1252,7 +1332,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             aa = a + ((m & -2) - 2) * k;
             cc = c + ((m & -2) - 2);
 
-            dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
+            dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, (k - kk));
 
             kk -= 2;
         }
@@ -1262,7 +1342,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             aa = a + ((m & -4) - 4) * k;
             cc = c + ((m & -4) - 4);
 
-            dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
+            dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, (k - kk));
 
             kk -= 4;
         }
@@ -1276,7 +1356,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
         do
        {
-            dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
+            dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, (k - kk));
 
             aa -= 8 * k;
             cc -= 8;
@@ -1310,7 +1390,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             aa = a + ((m & -2) - 2) * k + kk * 2;
             cc = c + ((m & -2) - 2);
 
-            dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
+            dsolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
 
             kk -= 2;
         }
@@ -1320,7 +1400,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
             aa = a + ((m & -4) - 4) * k;
             cc = c + ((m & -4) - 4);
 
-            dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
+            dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, (k - kk));
 
             kk -= 4;
         }
@@ -1334,7 +1414,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
         do
        {
-            dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
+            dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));
 
             aa -= 8 * k;
             cc -= 8;
diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
index 897fd313b..6406988e5 100644
--- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
+++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c
@@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
     v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@@ -43,6 +44,28 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     FLOAT *c_nxt2line = c + 2 * ldc;
     FLOAT *c_nxt3line = c + 3 * ldc;
 
+#ifdef ENABLE_PREFETCH
+    a += bk * 8;
+    __asm__ __volatile__(
+        "pref 0, (%[a]) \n\t"
+        "pref 0, 32(%[a]) \n\t"
+        "pref 0, 72(%[a]) \n\t"
+        "pref 0, 104(%[a]) \n\t"
+        "pref 0, 144(%[a]) \n\t"
+        "pref 0, 176(%[a]) \n\t"
+        "pref 0, 216(%[a]) \n\t"
+        "pref 0, 248(%[a]) \n\t"
+        "pref 0, 288(%[a]) \n\t"
+        "pref 0, 360(%[a]) \n\t"
+        "pref 0, 504(%[a]) \n\t"
+        "pref 0, 432(%[a]) \n\t"
+
+        :
+        : [a] "r"(a)
+    );
+    a -= bk * 8;
+#endif
+
     LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
     LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
     LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
@@ -53,16 +76,25 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         BLASLONG i;
         v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
 
-        LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
-        LD_DP2(b, 2, src_b0, src_b1);
+        LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2_INC(b, 2, src_b0, src_b1);
 
-        for (i = (bk - 1); i--;)
+        for (i = ((bk - 1) >> 1); i--;)
         {
-            a += 8;
-            b += 4;
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 128(%[a]) \n\t"
+                "pref 0, 160(%[a]) \n\t"
+                "pref 0, 192(%[a]) \n\t"
+                "pref 0, 224(%[a]) \n\t"
 
-            LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
-            LD_DP2(b, 2, src_b2, src_b3);
+                :
+                : [a] "r"(a)
+            );
+#endif
+
+            LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2_INC(b, 2, src_b2, src_b3);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             src_c0 -= src_a0 * src_b;
@@ -88,12 +120,62 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
             src_c14 -= src_a2 * src_b;
             src_c15 -= src_a3 * src_b;
 
-            src_a0 = src_a4;
-            src_a1 = src_a5;
-            src_a2 = src_a6;
-            src_a3 = src_a7;
-            src_b0 = src_b2;
-            src_b1 = src_b3;
+            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(b, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c0 -= src_a4 * src_b;
+            src_c1 -= src_a5 * src_b;
+            src_c2 -= src_a6 * src_b;
+            src_c3 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c4 -= src_a4 * src_b;
+            src_c5 -= src_a5 * src_b;
+            src_c6 -= src_a6 * src_b;
+            src_c7 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c8 -= src_a4 * src_b;
+            src_c9 -= src_a5 * src_b;
+            src_c10 -= src_a6 * src_b;
+            src_c11 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c12 -= src_a4 * src_b;
+            src_c13 -= src_a5 * src_b;
+            src_c14 -= src_a6 * src_b;
+            src_c15 -= src_a7 * src_b;
+        }
+
+        if ((bk - 1) & 1)
+        {
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(b, 2, src_b0, src_b1);
         }
 
         src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@@ -119,9 +201,6 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         src_c13 -= src_a1 * src_b;
         src_c14 -= src_a2 * src_b;
         src_c15 -= src_a3 * src_b;
-
-        a += 8;
-        b += 4;
     }
 
     ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
index 44313241e..e5343ce23 100644
--- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
+++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c
@@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
     v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@@ -49,16 +50,25 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
         v2f64 src_b;
 
-        LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
-        LD_DP2(b, 2, src_b0, src_b1);
+        LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2_INC(b, 2, src_b0, src_b1);
 
-        for (i = (bk - 1); i--;)
+        for (i = ((bk - 1) >> 1); i--;)
        {
-            a += 8;
-            b += 4;
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 128(%[a]) \n\t"
+                "pref 0, 160(%[a]) \n\t"
+                "pref 0, 192(%[a]) \n\t"
+                "pref 0, 224(%[a]) \n\t"
 
-            LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
-            LD_DP2(b, 2, src_b2, src_b3);
+                :
+                : [a] "r"(a)
+            );
+#endif
+
+            LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2_INC(b, 2, src_b2, src_b3);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             src_c0 -= src_a0 * src_b;
@@ -84,12 +94,62 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
             src_c14 -= src_a2 * src_b;
             src_c15 -= src_a3 * src_b;
 
-            src_a0 = src_a4;
-            src_a1 = src_a5;
-            src_a2 = src_a6;
-            src_a3 = src_a7;
-            src_b0 = src_b2;
-            src_b1 = src_b3;
+            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(b, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c0 -= src_a4 * src_b;
+            src_c1 -= src_a5 * src_b;
+            src_c2 -= src_a6 * src_b;
+            src_c3 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c4 -= src_a4 * src_b;
+            src_c5 -= src_a5 * src_b;
+            src_c6 -= src_a6 * src_b;
+            src_c7 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c8 -= src_a4 * src_b;
+            src_c9 -= src_a5 * src_b;
+            src_c10 -= src_a6 * src_b;
+            src_c11 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c12 -= src_a4 * src_b;
+            src_c13 -= src_a5 * src_b;
+            src_c14 -= src_a6 * src_b;
+            src_c15 -= src_a7 * src_b;
+        }
+
+        if ((bk - 1) & 1)
+        {
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(b, 2, src_b0, src_b1);
         }
 
         src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@@ -115,9 +175,6 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         src_c13 -= src_a1 * src_b;
         src_c14 -= src_a2 * src_b;
         src_c15 -= src_a3 * src_b;
-
-        a += 8;
-        b += 4;
     }
 
     src_b0 = LD_DP(b + 0);
diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
index 49274e5bc..837ba3e5b 100644
--- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
+++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c
@@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
     v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@@ -50,16 +51,24 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
         v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
 
-        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
-        LD_DP2(pbb, 2, src_b0, src_b1);
+        LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+        LD_DP2_INC(pbb, 2, src_b0, src_b1);
 
-        for (i = (bk - 1); i--;)
+        for (i = ((bk - 1) >> 1); i--;)
        {
-            pba += 8;
-            pbb += 4;
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 128(%[pba]) \n\t"
+                "pref 0, 160(%[pba]) \n\t"
+                "pref 0, 192(%[pba]) \n\t"
+                "pref 0, 224(%[pba]) \n\t"
 
-            LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
-            LD_DP2(pbb, 2, src_b2, src_b3);
+                :
+                : [pba] "r"(pba)
+            );
+#endif
+            LD_DP4_INC(pba, 2, src_a4, src_a5, src_a6, src_a7);
+            LD_DP2_INC(pbb, 2, src_b2, src_b3);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             src_c0 -= src_a0 * src_b;
@@ -85,12 +94,62 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
             src_c14 -= src_a2 * src_b;
             src_c15 -= src_a3 * src_b;
 
-            src_a0 = src_a4;
-            src_a1 = src_a5;
-            src_a2 = src_a6;
-            src_a3 = src_a7;
-            src_b0 = src_b2;
-            src_b1 = src_b3;
+            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(pbb, 2, src_b0, src_b1);
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c0 -= src_a4 * src_b;
+            src_c1 -= src_a5 * src_b;
+            src_c2 -= src_a6 * src_b;
+            src_c3 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+            src_c4 -= src_a4 * src_b;
+            src_c5 -= src_a5 * src_b;
+            src_c6 -= src_a6 * src_b;
+            src_c7 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c8 -= src_a4 * src_b;
+            src_c9 -= src_a5 * src_b;
+            src_c10 -= src_a6 * src_b;
+            src_c11 -= src_a7 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+            src_c12 -= src_a4 * src_b;
+            src_c13 -= src_a5 * src_b;
+            src_c14 -= src_a6 * src_b;
+            src_c15 -= src_a7 * src_b;
+        }
+
+        if ((bk - 1) & 1)
+        {
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c0 -= src_a0 * src_b;
+            src_c1 -= src_a1 * src_b;
+            src_c2 -= src_a2 * src_b;
+            src_c3 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+            src_c4 -= src_a0 * src_b;
+            src_c5 -= src_a1 * src_b;
+            src_c6 -= src_a2 * src_b;
+            src_c7 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c8 -= src_a0 * src_b;
+            src_c9 -= src_a1 * src_b;
+            src_c10 -= src_a2 * src_b;
+            src_c11 -= src_a3 * src_b;
+
+            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+            src_c12 -= src_a0 * src_b;
+            src_c13 -= src_a1 * src_b;
+            src_c14 -= src_a2 * src_b;
+            src_c15 -= src_a3 * src_b;
+
+            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(pbb, 2, src_b0, src_b1);
         }
 
         src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@@ -881,7 +940,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
         for (i = (m >> 3); i--;)
         {
-            dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
+            dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk));
 
             aa += 8 * k;
             cc += 8;
@@ -891,7 +950,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
        {
            if (m & 4)
            {
-                dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
+                dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk));
 
                aa += 4 * k;
                cc += 4;
@@ -899,7 +958,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 2)
            {
-                dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
+                dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk));
 
                aa += 2 * k;
                cc += 2;
@@ -907,7 +966,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 1)
            {
-                dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
+                dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk));
 
                aa += k;
                cc += 1;
@@ -928,7 +987,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
        for (i = (m >> 3); i--;)
        {
-            dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
+            dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk));
 
            aa += 8 * k;
            cc += 8;
@@ -938,7 +997,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
        {
            if (m & 4)
            {
-                dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
+                dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk));
 
                aa += 4 * k;
                cc += 4;
@@ -946,7 +1005,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 2)
            {
-                dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
+                dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk));
 
                aa += 2 * k;
                cc += 2;
@@ -954,7 +1013,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 1)
            {
-                dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+                dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
 
                aa += k;
                cc += 1;
@@ -975,7 +1034,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
        for (i = (m >> 3); i--;)
        {
-            dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk);
+            dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk));
 
            aa += 8 * k;
            cc += 8;
@@ -985,7 +1044,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
        {
            if (m & 4)
            {
-                dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
+                dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk));
 
                aa += 4 * k;
                cc += 4;
@@ -993,7 +1052,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 2)
            {
-                dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
+                dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk));
 
                aa += 2 * k;
                cc += 2;
@@ -1001,7 +1060,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
 
            if (m & 1)
            {
-                dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+                dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
 
                aa += k;
                cc += 1;
diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h
index dbc185302..5ef685278 100644
--- a/kernel/mips/macros_msa.h
+++ b/kernel/mips/macros_msa.h
@@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <msa.h>
 
+#define ENABLE_PREFETCH
+
 #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
 
 #define LD_SP(...) LD_W(v4f32, __VA_ARGS__)
diff --git a/kernel/mips/strsm_kernel_LN_8x8_msa.c b/kernel/mips/strsm_kernel_LN_8x8_msa.c
index 53891e64f..1974a384b 100644
--- a/kernel/mips/strsm_kernel_LN_8x8_msa.c
+++ b/kernel/mips/strsm_kernel_LN_8x8_msa.c
@@ -30,9 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
-    BLASLONG k;
-    FLOAT *aa = a, *bb = b;
-    v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
     v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
     v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@@ -59,34 +56,96 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     LD_SP2(c_nxt6line, 4, src_c12, src_c13);
     LD_SP2(c_nxt7line, 4, src_c14, src_c15);
 
-    for (k = 0; k < bk; k++)
+    if (bk > 0)
     {
-        LD_SP2(aa, 4, src_a0, src_a1);
+        BLASLONG k;
+        FLOAT *aa = a, *bb = b;
+        v4f32 src_bb0, src_bb1, src_b0, src_b1, src_b2, src_b3, src_a1;
 
-        src_b = LD_SP(bb + 0);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c0 -= src_a0 * src_b0;
-        src_c1 -= src_a1 * src_b0;
-        src_c2 -= src_a0 * src_b1;
-        src_c3 -= src_a1 * src_b1;
-        src_c4 -= src_a0 * src_b2;
-        src_c5 -= src_a1 * src_b2;
-        src_c6 -= src_a0 * src_b3;
-        src_c7 -= src_a1 * src_b3;
+        for (k = 0; k < (bk >> 1); k++)
+        {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 64(%[aa]) \n\t"
+                "pref 0, 96(%[aa]) \n\t"
 
-        src_b = LD_SP(bb + 4);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c8 -= src_a0 * src_b0;
-        src_c9 -= src_a1 * src_b0;
-        src_c10 -= src_a0 * src_b1;
-        src_c11 -= src_a1 * src_b1;
-        src_c12 -= src_a0 * src_b2;
-        src_c13 -= src_a1 * src_b2;
-        src_c14 -= src_a0 * src_b3;
-        src_c15 -= src_a1 * src_b3;
+                :
+                : [aa] "r" (aa)
+            );
+#endif
 
-        aa += 8;
-        bb += 8;
+            LD_SP2_INC(aa, 4, src_a0, src_a1);
+            LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+
+            LD_SP2_INC(aa, 4, src_a0, src_a1);
+            LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2(aa, 4, src_a0, src_a1);
+            LD_SP2(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
     }
 
     a -= 64;
diff --git a/kernel/mips/strsm_kernel_LT_8x8_msa.c b/kernel/mips/strsm_kernel_LT_8x8_msa.c
index 5834d77b2..547a55fe8 100644
--- a/kernel/mips/strsm_kernel_LT_8x8_msa.c
+++ b/kernel/mips/strsm_kernel_LT_8x8_msa.c
@@ -30,8 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
-    BLASLONG k;
-    v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
     v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
     v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@@ -58,34 +56,95 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     LD_SP2(c_nxt6line, 4, src_c12, src_c13);
     LD_SP2(c_nxt7line, 4, src_c14, src_c15);
 
-    for (k = 0; k < bk; k++)
+    if (bk > 0)
     {
-        LD_SP2(a, 4, src_a0, src_a1);
+        BLASLONG k;
+        v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;
 
-        src_b = LD_SP(b + 0);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c0 -= src_a0 * src_b0;
-        src_c1 -= src_a1 * src_b0;
-        src_c2 -= src_a0 * src_b1;
-        src_c3 -= src_a1 * src_b1;
-        src_c4 -= src_a0 * src_b2;
-        src_c5 -= src_a1 * src_b2;
-        src_c6 -= src_a0 * src_b3;
-        src_c7 -= src_a1 * src_b3;
+        for (k = 0; k < (bk >> 1); k++)
+        {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 64(%[a]) \n\t"
+                "pref 0, 96(%[a]) \n\t"
 
-        src_b = LD_SP(b + 4);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c8 -= src_a0 * src_b0;
-        src_c9 -= src_a1 * src_b0;
-        src_c10 -= src_a0 * src_b1;
-        src_c11 -= src_a1 * src_b1;
-        src_c12 -= src_a0 * src_b2;
-        src_c13 -= src_a1 * src_b2;
-        src_c14 -= src_a0 * src_b3;
-        src_c15 -= src_a1 * src_b3;
+                :
+                : [a] "r" (a)
+            );
+#endif
 
-        a += 8;
-        b += 8;
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
     }
 
     TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
diff --git a/kernel/mips/strsm_kernel_RN_8x8_msa.c b/kernel/mips/strsm_kernel_RN_8x8_msa.c
index 642ee3757..d1d02d5e3 100644
--- a/kernel/mips/strsm_kernel_RN_8x8_msa.c
+++ b/kernel/mips/strsm_kernel_RN_8x8_msa.c
@@ -30,8 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
-    BLASLONG k;
-    v4f32 src_a0, src_a1;
     v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
     v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
@@ -56,34 +54,94 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     LD_SP2(c_nxt6line, 4, src_c12, src_c13);
     LD_SP2(c_nxt7line, 4, src_c14, src_c15);
 
-    for (k = 0; k < bk; k++)
+    if (bk > 0)
     {
-        LD_SP2(a, 4, src_a0, src_a1);
+        BLASLONG k;
+        v4f32 src_a0, src_a1, src_bb0, src_bb1;
 
-        src_b = LD_SP(b + 0);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c0 -= src_a0 * src_b0;
-        src_c1 -= src_a1 * src_b0;
-        src_c2 -= src_a0 * src_b1;
-        src_c3 -= src_a1 * src_b1;
-        src_c4 -= src_a0 * src_b2;
-        src_c5 -= src_a1 * src_b2;
-        src_c6 -= src_a0 * src_b3;
-        src_c7 -= src_a1 * src_b3;
+        for (k = 0; k < (bk >> 1); k++)
+        {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 64(%[a]) \n\t"
+                "pref 0, 96(%[a]) \n\t"
 
-        src_b = LD_SP(b + 4);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c8 -= src_a0 * src_b0;
-        src_c9 -= src_a1 * src_b0;
-        src_c10 -= src_a0 * src_b1;
-        src_c11 -= src_a1 * src_b1;
-        src_c12 -= src_a0 * src_b2;
-        src_c13 -= src_a1 * src_b2;
-        src_c14 -= src_a0 * src_b3;
-        src_c15 -= src_a1 * src_b3;
+                :
+                : [a] "r" (a)
+            );
+#endif
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
 
-        a += 8;
-        b += 8;
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2_INC(a, 4, src_a0, src_a1);
+            LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
     }
 
     src_b = LD_SP(b + 0);
diff --git a/kernel/mips/strsm_kernel_RT_8x8_msa.c b/kernel/mips/strsm_kernel_RT_8x8_msa.c
index 21e41c8fb..6d3904660 100644
--- a/kernel/mips/strsm_kernel_RT_8x8_msa.c
+++ b/kernel/mips/strsm_kernel_RT_8x8_msa.c
@@ -30,9 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
 {
-    BLASLONG k;
-    FLOAT *aa = a, *bb = b;
-    v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
     v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
     v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
     v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
@@ -57,34 +54,96 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     LD_SP2(c_nxt6line, 4, src_c12, src_c13);
     LD_SP2(c_nxt7line, 4, src_c14, src_c15);
 
-    for (k = 0; k < bk; k++)
+    if (bk > 0)
     {
-        LD_SP2(aa, 4, src_a0, src_a1);
+        BLASLONG k;
+        FLOAT *aa = a, *bb = b;
+        v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1;
 
-        src_b = LD_SP(bb + 0);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c0 -= src_a0 * src_b0;
-        src_c1 -= src_a1 * src_b0;
-        src_c2 -= src_a0 * src_b1;
-        src_c3 -= src_a1 * src_b1;
-        src_c4 -= src_a0 * src_b2;
-        src_c5 -= src_a1 * src_b2;
-        src_c6 -= src_a0 * src_b3;
-        src_c7 -= src_a1 * src_b3;
+        for (k = 0; k < (bk >> 1); k++)
+        {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref 0, 64(%[aa]) \n\t"
+                "pref 0, 96(%[aa]) \n\t"
 
-        src_b = LD_SP(bb + 4);
-        SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
-        src_c8 -= src_a0 * src_b0;
-        src_c9 -= src_a1 * src_b0;
-        src_c10 -= src_a0 * src_b1;
-        src_c11 -= src_a1 * src_b1;
-        src_c12 -= src_a0 * src_b2;
-        src_c13 -= src_a1 * src_b2;
-        src_c14 -= src_a0 * src_b3;
-        src_c15 -= src_a1 * src_b3;
+                :
+                : [aa] "r" (aa)
+            );
+#endif
 
-        aa += 8;
-        bb += 8;
+            LD_SP2_INC(aa, 4, src_a0, src_a1);
+            LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+
+            LD_SP2_INC(aa, 4, src_a0, src_a1);
+            LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
+
+        if (bk & 1)
+        {
+            LD_SP2(aa, 4, src_a0, src_a1);
+            LD_SP2(bb, 4, src_bb0, src_bb1);
+
+            SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+            src_c0 -= src_a0 * src_b0;
+            src_c1 -= src_a1 * src_b0;
+            src_c2 -= src_a0 * src_b1;
+            src_c3 -= src_a1 * src_b1;
+            src_c4 -= src_a0 * src_b2;
+            src_c5 -= src_a1 * src_b2;
+            src_c6 -= src_a0 * src_b3;
+            src_c7 -= src_a1 * src_b3;
+
+            SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+            src_c8 -= src_a0 * src_b0;
+            src_c9 -= src_a1 * src_b0;
+            src_c10 -= src_a0 * src_b1;
+            src_c11 -= src_a1 * src_b1;
+            src_c12 -= src_a0 * src_b2;
+            src_c13 -= src_a1 * src_b2;
+            src_c14 -= src_a0 * src_b3;
+            src_c15 -= src_a1 * src_b3;
+        }
    }
 
    b -= 64;
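
Note on the pattern: every kernel in this patch receives the same three changes. The hot 8x4/8x8 solvers are marked `__attribute__ ((noinline))`, the bk accumulation loop is unrolled by two with the odd trip count handled in a separate `if ((bk - 1) & 1)` / `if (bk & 1)` tail, and the packed A panel is prefetched ahead of use with the MIPS `pref` instruction (gated by ENABLE_PREFETCH in macros_msa.h). The following is a minimal standalone sketch of that loop structure, not part of the patch: it uses GCC's portable __builtin_prefetch in place of the inline-asm pref, and a 2-element scalar update in place of the MSA register block; update_sketch and its parameters are illustrative names only.

/* Sketch: 2x-unrolled update loop with software prefetch and an
 * odd-count tail, mirroring the "for (i = (bk - 1) >> 1; i--;)" /
 * "if ((bk - 1) & 1)" structure the patch gives the kernels above. */
static void update_sketch(const double *a, const double *b, double *c, long bk)
{
    long i;

    for (i = bk >> 1; i--;)
    {
        /* Prefetch ~128 bytes ahead in the A panel, the analogue of
         * the "pref 0, 128(%[pba])" instructions in the kernels. */
        __builtin_prefetch(a + 16, 0, 0);

        c[0] -= a[0] * b[0];    /* first unrolled iteration */
        c[1] -= a[1] * b[0];
        a += 8;                 /* one step of the 8-wide A panel */
        b += 4;                 /* one step of the 4-wide B panel */

        c[0] -= a[0] * b[0];    /* second unrolled iteration */
        c[1] -= a[1] * b[0];
        a += 8;
        b += 4;
    }

    if (bk & 1)                 /* tail: odd trip count */
    {
        c[0] -= a[0] * b[0];
        c[1] -= a[1] * b[0];
    }
}

Unrolling by two lets the loads for one half-iteration overlap the arithmetic of the other, and the explicit prefetches pull the streamed A panel into cache before the loads need it; the tail block keeps the transformation exact for odd bk.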