Add data prefetch to STRSM and DTRSM MSA kernel functions

Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
This commit is contained in:
kaustubh 2016-10-14 16:41:28 +05:30
parent ef52a9266b
commit 90e2321ac3
9 changed files with 702 additions and 190 deletions

View File

@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "macros_msa.h"
static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
static __attribute__ ((noinline))
void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@ -44,6 +45,26 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
FLOAT *c_nxt2line = c + 2 * ldc;
FLOAT *c_nxt3line = c + 3 * ldc;
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, -96(%[a]) \n\t"
"pref 0, -32(%[a]) \n\t"
"pref 0, -160(%[a]) \n\t"
"pref 0, -224(%[a]) \n\t"
"pref 0, -64(%[a]) \n\t"
"pref 0, -128(%[a]) \n\t"
"pref 0, -192(%[a]) \n\t"
"pref 0, -256(%[a]) \n\t"
"pref 0, -320(%[a]) \n\t"
"pref 0, -384(%[a]) \n\t"
"pref 0, -448(%[a]) \n\t"
"pref 0, -512(%[a]) \n\t"
:
: [a] "r"(a)
);
#endif
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
@ -55,16 +76,25 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
FLOAT *pba = a, *pbb = b;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(pbb, 2, src_b0, src_b1);
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
for (i = (bk - 1); i--;)
for (i = (bk - 1) >> 1; i--;)
{
pba += 8;
pbb += 4;
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 128(%[pba]) \n\t"
"pref 0, 160(%[pba]) \n\t"
"pref 0, 192(%[pba]) \n\t"
"pref 0, 224(%[pba]) \n\t"
LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
LD_DP2(pbb, 2, src_b2, src_b3);
:
: [pba] "r"(pba)
);
#endif
LD_DP4_INC(pba, 2, src_a8, src_a9, src_a16, src_a17);
LD_DP2_INC(pbb, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -90,12 +120,62 @@ static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
src_a0 = src_a8;
src_a1 = src_a9;
src_a2 = src_a16;
src_a3 = src_a17;
src_b0 = src_b2;
src_b1 = src_b3;
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
src_c0 -= src_a8 * src_b;
src_c1 -= src_a9 * src_b;
src_c2 -= src_a16 * src_b;
src_c3 -= src_a17 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
src_c4 -= src_a8 * src_b;
src_c5 -= src_a9 * src_b;
src_c6 -= src_a16 * src_b;
src_c7 -= src_a17 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
src_c8 -= src_a8 * src_b;
src_c9 -= src_a9 * src_b;
src_c10 -= src_a16 * src_b;
src_c11 -= src_a17 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
src_c12 -= src_a8 * src_b;
src_c13 -= src_a9 * src_b;
src_c14 -= src_a16 * src_b;
src_c15 -= src_a17 * src_b;
}
if ((bk - 1) & 1)
{
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
src_c8 -= src_a0 * src_b;
src_c9 -= src_a1 * src_b;
src_c10 -= src_a2 * src_b;
src_c11 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
src_c12 -= src_a0 * src_b;
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@ -1180,7 +1260,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + (m - 1);
dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
dsolve_1x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 1;
}
@ -1191,7 +1271,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + ((m & -2) - 2);
dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
dsolve_2x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 2;
}
@ -1202,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
bb = b + 4 * kk;
cc = c + ((m & -4) - 4);
dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
dsolve_4x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 4;
}
@ -1216,7 +1296,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
do
{
dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
aa -= 8 * k;
cc -= 8;
@ -1252,7 +1332,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -2) - 2) * k;
cc = c + ((m & -2) - 2);
dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, (k - kk));
kk -= 2;
}
@ -1262,7 +1342,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, (k - kk));
kk -= 4;
}
@ -1276,7 +1356,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
do
{
dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, (k - kk));
aa -= 8 * k;
cc -= 8;
@ -1310,7 +1390,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -2) - 2) * k + kk * 2;
cc = c + ((m & -2) - 2);
dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
dsolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
kk -= 2;
}
@ -1320,7 +1400,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, (k - kk));
kk -= 4;
}
@ -1334,7 +1414,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
do
{
dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));
aa -= 8 * k;
cc -= 8;

View File

@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "macros_msa.h"
static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
static __attribute__ ((noinline))
void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@ -43,6 +44,28 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
FLOAT *c_nxt2line = c + 2 * ldc;
FLOAT *c_nxt3line = c + 3 * ldc;
#ifdef ENABLE_PREFETCH
a += bk * 8;
__asm__ __volatile__(
"pref 0, (%[a]) \n\t"
"pref 0, 32(%[a]) \n\t"
"pref 0, 72(%[a]) \n\t"
"pref 0, 104(%[a]) \n\t"
"pref 0, 144(%[a]) \n\t"
"pref 0, 176(%[a]) \n\t"
"pref 0, 216(%[a]) \n\t"
"pref 0, 248(%[a]) \n\t"
"pref 0, 288(%[a]) \n\t"
"pref 0, 360(%[a]) \n\t"
"pref 0, 504(%[a]) \n\t"
"pref 0, 432(%[a]) \n\t"
:
: [a] "r"(a)
);
a -= bk * 8;
#endif
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
@ -53,16 +76,25 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
BLASLONG i;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(b, 2, src_b0, src_b1);
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
for (i = (bk - 1); i--;)
for (i = ((bk - 1) >> 1); i--;)
{
a += 8;
b += 4;
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 128(%[a]) \n\t"
"pref 0, 160(%[a]) \n\t"
"pref 0, 192(%[a]) \n\t"
"pref 0, 224(%[a]) \n\t"
LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2(b, 2, src_b2, src_b3);
:
: [a] "r"(a)
);
#endif
LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2_INC(b, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -88,12 +120,62 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b2;
src_b1 = src_b3;
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
src_c0 -= src_a4 * src_b;
src_c1 -= src_a5 * src_b;
src_c2 -= src_a6 * src_b;
src_c3 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
src_c4 -= src_a4 * src_b;
src_c5 -= src_a5 * src_b;
src_c6 -= src_a6 * src_b;
src_c7 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
src_c8 -= src_a4 * src_b;
src_c9 -= src_a5 * src_b;
src_c10 -= src_a6 * src_b;
src_c11 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
src_c12 -= src_a4 * src_b;
src_c13 -= src_a5 * src_b;
src_c14 -= src_a6 * src_b;
src_c15 -= src_a7 * src_b;
}
if ((bk - 1) & 1)
{
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
src_c8 -= src_a0 * src_b;
src_c9 -= src_a1 * src_b;
src_c10 -= src_a2 * src_b;
src_c11 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
src_c12 -= src_a0 * src_b;
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@ -119,9 +201,6 @@ static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
a += 8;
b += 4;
}
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);

View File

@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "macros_msa.h"
static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
static __attribute__ ((noinline))
void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@ -49,16 +50,25 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
v2f64 src_b;
LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(b, 2, src_b0, src_b1);
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
for (i = (bk - 1); i--;)
for (i = ((bk - 1) >> 1); i--;)
{
a += 8;
b += 4;
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 128(%[a]) \n\t"
"pref 0, 160(%[a]) \n\t"
"pref 0, 192(%[a]) \n\t"
"pref 0, 224(%[a]) \n\t"
LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2(b, 2, src_b2, src_b3);
:
: [a] "r"(a)
);
#endif
LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2_INC(b, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -84,12 +94,62 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b2;
src_b1 = src_b3;
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
src_c0 -= src_a4 * src_b;
src_c1 -= src_a5 * src_b;
src_c2 -= src_a6 * src_b;
src_c3 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
src_c4 -= src_a4 * src_b;
src_c5 -= src_a5 * src_b;
src_c6 -= src_a6 * src_b;
src_c7 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
src_c8 -= src_a4 * src_b;
src_c9 -= src_a5 * src_b;
src_c10 -= src_a6 * src_b;
src_c11 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
src_c12 -= src_a4 * src_b;
src_c13 -= src_a5 * src_b;
src_c14 -= src_a6 * src_b;
src_c15 -= src_a7 * src_b;
}
if ((bk - 1) & 1)
{
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
src_c8 -= src_a0 * src_b;
src_c9 -= src_a1 * src_b;
src_c10 -= src_a2 * src_b;
src_c11 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
src_c12 -= src_a0 * src_b;
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(b, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@ -115,9 +175,6 @@ static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
a += 8;
b += 4;
}
src_b0 = LD_DP(b + 0);

View File

@ -28,7 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include "macros_msa.h"
static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
static __attribute__ ((noinline))
void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
@ -50,16 +51,24 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(pbb, 2, src_b0, src_b1);
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
for (i = (bk - 1); i--;)
for (i = ((bk - 1) >> 1); i--;)
{
pba += 8;
pbb += 4;
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 128(%[pba]) \n\t"
"pref 0, 160(%[pba]) \n\t"
"pref 0, 192(%[pba]) \n\t"
"pref 0, 224(%[pba]) \n\t"
LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2(pbb, 2, src_b2, src_b3);
:
: [pba] "r"(pba)
);
#endif
LD_DP4_INC(pba, 2, src_a4, src_a5, src_a6, src_a7);
LD_DP2_INC(pbb, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
@ -85,12 +94,62 @@ static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
src_a0 = src_a4;
src_a1 = src_a5;
src_a2 = src_a6;
src_a3 = src_a7;
src_b0 = src_b2;
src_b1 = src_b3;
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
src_c0 -= src_a4 * src_b;
src_c1 -= src_a5 * src_b;
src_c2 -= src_a6 * src_b;
src_c3 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
src_c4 -= src_a4 * src_b;
src_c5 -= src_a5 * src_b;
src_c6 -= src_a6 * src_b;
src_c7 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
src_c8 -= src_a4 * src_b;
src_c9 -= src_a5 * src_b;
src_c10 -= src_a6 * src_b;
src_c11 -= src_a7 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
src_c12 -= src_a4 * src_b;
src_c13 -= src_a5 * src_b;
src_c14 -= src_a6 * src_b;
src_c15 -= src_a7 * src_b;
}
if ((bk - 1) & 1)
{
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c2 -= src_a2 * src_b;
src_c3 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
src_c4 -= src_a0 * src_b;
src_c5 -= src_a1 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
src_c8 -= src_a0 * src_b;
src_c9 -= src_a1 * src_b;
src_c10 -= src_a2 * src_b;
src_c11 -= src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
src_c12 -= src_a0 * src_b;
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2_INC(pbb, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
@ -881,7 +940,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (i = (m >> 3); i--;)
{
dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk));
aa += 8 * k;
cc += 8;
@ -891,7 +950,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk));
aa += 4 * k;
cc += 4;
@ -899,7 +958,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk));
aa += 2 * k;
cc += 2;
@ -907,7 +966,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk));
aa += k;
cc += 1;
@ -928,7 +987,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (i = (m >> 3); i--;)
{
dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk));
aa += 8 * k;
cc += 8;
@ -938,7 +997,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk));
aa += 4 * k;
cc += 4;
@ -946,7 +1005,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk));
aa += 2 * k;
cc += 2;
@ -954,7 +1013,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
aa += k;
cc += 1;
@ -975,7 +1034,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (i = (m >> 3); i--;)
{
dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk);
dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk));
aa += 8 * k;
cc += 8;
@ -985,7 +1044,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{
if (m & 4)
{
dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk));
aa += 4 * k;
cc += 4;
@ -993,7 +1052,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 2)
{
dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk));
aa += 2 * k;
cc += 2;
@ -1001,7 +1060,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (m & 1)
{
dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
aa += k;
cc += 1;

View File

@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <msa.h>
#define ENABLE_PREFETCH
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)

View File

@ -30,9 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
BLASLONG k;
FLOAT *aa = a, *bb = b;
v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@ -59,34 +56,96 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
for (k = 0; k < bk; k++)
if (bk > 0)
{
LD_SP2(aa, 4, src_a0, src_a1);
BLASLONG k;
FLOAT *aa = a, *bb = b;
v4f32 src_bb0, src_bb1, src_b0, src_b1, src_b2, src_b3, src_a1;
src_b = LD_SP(bb + 0);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
for (k = 0; k < (bk >> 1); k++)
{
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 64(%[aa]) \n\t"
"pref 0, 96(%[aa]) \n\t"
src_b = LD_SP(bb + 4);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
:
: [aa] "r" (aa)
);
#endif
aa += 8;
bb += 8;
LD_SP2_INC(aa, 4, src_a0, src_a1);
LD_SP2_INC(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
LD_SP2_INC(aa, 4, src_a0, src_a1);
LD_SP2_INC(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
if (bk & 1)
{
LD_SP2(aa, 4, src_a0, src_a1);
LD_SP2(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
}
a -= 64;

View File

@ -30,8 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
BLASLONG k;
v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
@ -58,34 +56,95 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
for (k = 0; k < bk; k++)
if (bk > 0)
{
LD_SP2(a, 4, src_a0, src_a1);
BLASLONG k;
v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;
src_b = LD_SP(b + 0);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
for (k = 0; k < (bk >> 1); k++)
{
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 64(%[a]) \n\t"
"pref 0, 96(%[a]) \n\t"
src_b = LD_SP(b + 4);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
:
: [a] "r" (a)
);
#endif
a += 8;
b += 8;
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
if (bk & 1)
{
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
}
TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,

View File

@ -30,8 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
BLASLONG k;
v4f32 src_a0, src_a1;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
@ -56,34 +54,94 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
for (k = 0; k < bk; k++)
if (bk > 0)
{
LD_SP2(a, 4, src_a0, src_a1);
BLASLONG k;
v4f32 src_a0, src_a1, src_bb0, src_bb1;
src_b = LD_SP(b + 0);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
for (k = 0; k < (bk >> 1); k++)
{
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 64(%[a]) \n\t"
"pref 0, 96(%[a]) \n\t"
src_b = LD_SP(b + 4);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
:
: [a] "r" (a)
);
#endif
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
a += 8;
b += 8;
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
if (bk & 1)
{
LD_SP2_INC(a, 4, src_a0, src_a1);
LD_SP2_INC(b, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
}
src_b = LD_SP(b + 0);

View File

@ -30,9 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
BLASLONG k;
FLOAT *aa = a, *bb = b;
v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
@ -57,34 +54,96 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
for (k = 0; k < bk; k++)
if (bk > 0)
{
LD_SP2(aa, 4, src_a0, src_a1);
BLASLONG k;
FLOAT *aa = a, *bb = b;
v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1;
src_b = LD_SP(bb + 0);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
for (k = 0; k < (bk >> 1); k++)
{
#ifdef ENABLE_PREFETCH
__asm__ __volatile__(
"pref 0, 64(%[aa]) \n\t"
"pref 0, 96(%[aa]) \n\t"
src_b = LD_SP(bb + 4);
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
:
: [aa] "r" (aa)
);
#endif
aa += 8;
bb += 8;
LD_SP2_INC(aa, 4, src_a0, src_a1);
LD_SP2_INC(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
LD_SP2_INC(aa, 4, src_a0, src_a1);
LD_SP2_INC(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
if (bk & 1)
{
LD_SP2(aa, 4, src_a0, src_a1);
LD_SP2(bb, 4, src_bb0, src_bb1);
SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
src_c2 -= src_a0 * src_b1;
src_c3 -= src_a1 * src_b1;
src_c4 -= src_a0 * src_b2;
src_c5 -= src_a1 * src_b2;
src_c6 -= src_a0 * src_b3;
src_c7 -= src_a1 * src_b3;
SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
src_c8 -= src_a0 * src_b0;
src_c9 -= src_a1 * src_b0;
src_c10 -= src_a0 * src_b1;
src_c11 -= src_a1 * src_b1;
src_c12 -= src_a0 * src_b2;
src_c13 -= src_a1 * src_b2;
src_c14 -= src_a0 * src_b3;
src_c15 -= src_a1 * src_b3;
}
}
b -= 64;