Merge pull request #986 from ksraste/develop
Add data prefetch to the SGEMM, DGEMM, CGEMM, and ZGEMM functions
This commit is contained in:
commit
e54b6ddaa0
|
@ -1092,18 +1092,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#if defined(TRMMKERNEL)
|
||||
BLASLONG off;
|
||||
#endif
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3;
|
||||
FLOAT *pa0, *pb0;
|
||||
FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
|
||||
FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
|
||||
FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
|
||||
FLOAT a0_r, a1_r;
|
||||
FLOAT a0_i, a1_i;
|
||||
FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i;
|
||||
FLOAT b0_r, b1_r, b2_r, b3_r;
|
||||
FLOAT b0_i, b1_i, b2_i, b3_i;
|
||||
v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
|
||||
v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
|
||||
v4f32 dst0, dst1, dst2, dst3;
|
||||
v4f32 alpha_r, alpha_i;
|
||||
v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i;
|
||||
v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
|
||||
v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
|
||||
v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
|
||||
|
@ -1122,12 +1118,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
pc2 = pc1 + 2 * ldc;
|
||||
pc3 = pc2 + 2 * ldc;
|
||||
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -1150,6 +1146,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
temp = k;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 64(%[pa0]) \n\t"
|
||||
"pref 0, 96(%[pa0]) \n\t"
|
||||
"pref 0, 32(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
CGEMM_KERNEL_8X4_MSA(, -, , +, +);
|
||||
#endif
|
||||
|
@ -1165,6 +1172,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
|
||||
for (l = (temp - 1); l--;)
|
||||
{
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 64(%[pa0]) \n\t"
|
||||
"pref 0, 96(%[pa0]) \n\t"
|
||||
"pref 0, 32(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
|
||||
#endif
|
||||
|
@ -1340,6 +1358,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_2X4
|
||||
#endif
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1357,11 +1379,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -1426,6 +1443,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_1X4
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1443,21 +1464,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 3;
|
||||
B = B + l;
|
||||
i = ldc << 3;
|
||||
C = C + i;
|
||||
B += (k << 3);
|
||||
C += (ldc << 3);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -1465,12 +1479,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
pc0 = C;
|
||||
pc1 = pc0 + 2 * ldc;
|
||||
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -1691,6 +1705,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_2X2
|
||||
#endif
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1708,9 +1724,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -1775,6 +1788,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_1X2
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1792,30 +1807,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 2;
|
||||
B = B + l;
|
||||
i = ldc << 2;
|
||||
C = C + i;
|
||||
B += (k << 2);
|
||||
C += (ldc << 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pc0 = C;
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -2036,6 +2047,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_2X1
|
||||
#endif
|
||||
pc0 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -2053,8 +2065,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 4;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -2119,6 +2129,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
CGEMM_SCALE_1X1
|
||||
#endif
|
||||
pc0 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -2136,18 +2147,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 1;
|
||||
B = B + l;
|
||||
i = ldc << 1;
|
||||
C = C + i;
|
||||
B += (k << 1);
|
||||
C += (ldc << 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -91,6 +91,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pb0 = B;
|
||||
temp = k;
|
||||
#endif
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 32(%[pa0]) \n\t"
|
||||
"pref 0, 32(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
LD_SP2_INC(pa0, 4, src_a0, src_a1);
|
||||
LD_SP2_INC(pb0, 4, src_b0, src_b1);
|
||||
|
@ -129,6 +138,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
|
||||
for (l = ((temp - 1) >> 1); l--;)
|
||||
{
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 64(%[pa0]) \n\t"
|
||||
"pref 0, 96(%[pa0]) \n\t"
|
||||
"pref 0, 64(%[pb0]) \n\t"
|
||||
"pref 0, 96(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
LD_SP2_INC(pa0, 4, src_a0, src_a1);
|
||||
LD_SP2_INC(pb0, 4, src_b0, src_b1);
|
||||
|
||||
|
@ -500,6 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
ST_SP(dst2, pc6);
|
||||
ST_SP(dst3, pc7);
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
pc4 += 4;
|
||||
pc5 += 4;
|
||||
pc6 += 4;
|
||||
pc7 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = k - off;
|
||||
|
@ -516,15 +546,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 4; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
pc4 += 4;
|
||||
pc5 += 4;
|
||||
pc6 += 4;
|
||||
pc7 += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
|
@ -763,6 +784,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc6[1] += tmp13;
|
||||
pc7[1] += tmp15;
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
pc4 += 2;
|
||||
pc5 += 2;
|
||||
pc6 += 2;
|
||||
pc7 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -780,15 +809,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
pc4 += 2;
|
||||
pc5 += 2;
|
||||
pc6 += 2;
|
||||
pc7 += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -959,6 +979,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc6[0] += tmp6;
|
||||
pc7[0] += tmp7;
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
pc2 += 1;
|
||||
pc3 += 1;
|
||||
pc4 += 1;
|
||||
pc5 += 1;
|
||||
pc6 += 1;
|
||||
pc7 += 1;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -976,24 +1004,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
pc2 += 1;
|
||||
pc3 += 1;
|
||||
pc4 += 1;
|
||||
pc5 += 1;
|
||||
pc6 += 1;
|
||||
pc7 += 1;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 8; // number of values in B
|
||||
#endif
|
||||
|
||||
l = (k << 3);
|
||||
B = B + l;
|
||||
i = (ldc << 3);
|
||||
C = C + i;
|
||||
B += (k << 3);
|
||||
C += (ldc << 3);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
|
@ -1003,12 +1021,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc2 = pc1 + ldc;
|
||||
pc3 = pc2 + ldc;
|
||||
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -1145,7 +1163,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
dst6 += res6 * v_alpha;
|
||||
dst7 += res7 * v_alpha;
|
||||
#endif
|
||||
|
||||
ST_SP2_INC(dst0, dst1, pc0, 4);
|
||||
ST_SP2_INC(dst2, dst3, pc1, 4);
|
||||
ST_SP2_INC(dst4, dst5, pc2, 4);
|
||||
|
@ -1268,6 +1285,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pa0 += 4;
|
||||
pb0 += 4;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
dst0 = res0 * v_alpha;
|
||||
dst1 = res1 * v_alpha;
|
||||
|
@ -1289,6 +1307,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
ST_SP(dst2, pc2);
|
||||
ST_SP(dst3, pc3);
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = k - off;
|
||||
|
@ -1305,10 +1328,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 4; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
pc2 += 4;
|
||||
pc3 += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
|
@ -1459,6 +1478,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc2[1] += tmp5;
|
||||
pc3[1] += tmp7;
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1476,11 +1499,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -1591,6 +1609,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc2[0] += tmp2;
|
||||
pc3[0] += tmp3;
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
pc2 += 1;
|
||||
pc3 += 1;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1608,20 +1630,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
pc2 += 1;
|
||||
pc3 += 1;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4; // number of values in B
|
||||
#endif
|
||||
|
||||
l = (k << 2);
|
||||
B = B + l;
|
||||
i = (ldc << 2);
|
||||
C = C + i;
|
||||
B += (k << 2);
|
||||
C += (ldc << 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -1629,12 +1645,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc0 = C;
|
||||
pc1 = pc0 + ldc;
|
||||
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -1847,6 +1863,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
ST_SP(dst0, pc0);
|
||||
ST_SP(dst1, pc1);
|
||||
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = k - off;
|
||||
|
@ -1863,8 +1882,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 4; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 4;
|
||||
pc1 += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
|
@ -1967,6 +1984,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc0[1] += tmp1;
|
||||
pc1[1] += tmp3;
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1984,9 +2003,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -2067,6 +2083,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc0[0] += tmp0;
|
||||
pc1[0] += tmp1;
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -2084,28 +2102,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 1;
|
||||
pc1 += 1;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2; // number of values in B
|
||||
#endif
|
||||
l = (k << 1);
|
||||
B = B + l;
|
||||
i = (ldc << 1);
|
||||
C = C + i;
|
||||
|
||||
B += (k << 1);
|
||||
C += (ldc << 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
pc0 = C;
|
||||
pa0 = A;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
pa0 = A;
|
||||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -2272,6 +2288,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
#endif
|
||||
ST_SP(dst0, pc0);
|
||||
|
||||
pc0 += 4;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = k - off;
|
||||
|
@ -2288,7 +2306,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 4; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
pc0 += 4;
|
||||
}
|
||||
|
||||
if (m & 2)
|
||||
|
@ -2359,6 +2376,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pb0 += 1;
|
||||
}
|
||||
|
||||
tmp0 = alpha * tmp0;
|
||||
tmp1 = alpha * tmp1;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
pc0[0] = tmp0;
|
||||
pc0[1] = tmp1;
|
||||
|
@ -2366,6 +2386,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
pc0[0] += tmp0;
|
||||
pc0[1] += tmp1;
|
||||
#endif
|
||||
pc0 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -2383,8 +2404,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
off += 2; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
@ -2448,34 +2467,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
|
|||
#else
|
||||
pc0[0] += alpha * tmp0;
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = k - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
pa0 += temp * 1;
|
||||
pb0 += temp * 1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 1;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1; // number of values in B
|
||||
#endif
|
||||
l = (k << 0);
|
||||
B = B + l;
|
||||
i = (ldc << 0);
|
||||
C = C + i;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -851,6 +851,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
temp = k;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 64(%[pa0]) \n\t"
|
||||
"pref 0, 96(%[pa0]) \n\t"
|
||||
"pref 0, 64(%[pb0]) \n\t"
|
||||
"pref 0, 96(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
|
||||
#endif
|
||||
|
@ -866,6 +878,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
|
||||
for (l = (temp - 1); l--;)
|
||||
{
|
||||
#ifdef ENABLE_PREFETCH
|
||||
__asm__ __volatile__(
|
||||
"pref 0, 64(%[pa0]) \n\t"
|
||||
"pref 0, 96(%[pa0]) \n\t"
|
||||
"pref 0, 64(%[pb0]) \n\t"
|
||||
"pref 0, 96(%[pb0]) \n\t"
|
||||
|
||||
:
|
||||
: [pa0] "r" (pa0), [pb0] "r" (pb0)
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
|
||||
#endif
|
||||
|
@ -1039,6 +1063,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
ZGEMM_SCALE_1X4_MSA
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1056,21 +1084,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
pc2 += 2;
|
||||
pc3 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 3;
|
||||
B = B + l;
|
||||
i = ldc << 3;
|
||||
C = C + i;
|
||||
B += (k << 3);
|
||||
C += (ldc << 3);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -1294,6 +1315,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
ZGEMM_SCALE_1X2_MSA
|
||||
#endif
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1311,19 +1334,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
pc1 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 2;
|
||||
B = B + l;
|
||||
i = ldc << 2;
|
||||
C = C + i;
|
||||
B += (k << 2);
|
||||
C += (ldc << 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -1555,6 +1573,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
#else
|
||||
ZGEMM_SCALE_1X1
|
||||
#endif
|
||||
pc0 += 2;
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
|
@ -1572,18 +1591,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
|
|||
off += 1; // number of values in A
|
||||
#endif
|
||||
#endif
|
||||
|
||||
pc0 += 2;
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1; // number of values in B
|
||||
#endif
|
||||
|
||||
l = k << 1;
|
||||
B = B + l;
|
||||
i = ldc << 1;
|
||||
C = C + i;
|
||||
B += (k << 1);
|
||||
C += (ldc << 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue