POWER10: Change dgemm unroll factors

Changing the unroll factors for dgemm to 8 improves performance with the
POWER10 MMA feature. Also made some minor changes to sgemm edge cases.
This commit is contained in:
Rajalakshmi Srinivasaraghavan 2020-10-31 18:28:57 -05:00
parent aa21cb5217
commit dd7a9cc5bf
5 changed files with 589 additions and 298 deletions

View File

@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_power10.c
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = dgemm_ncopy_4_power8.S
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = dgemm_ncopy_8_power10.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
)
{
BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
off = -offset;
#endif
v4sf_t valpha = { alpha, alpha };
N = n >> 2;
for (i1 = 0; i1 < N; i1++)
for (i1 = 0; i1 < (n >> 3); i1++)
{
BLASLONG i, j, temp;
BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
CO = C;
C += ldc << 3;
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 8);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
vec_t *rb = (vec_t *) & BO[0];
__vector_pair rowB, rowB1;
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
__builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
__builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
__builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
__builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 3];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
__builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
__builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
}
SAVE_ACC (&acc0, 0);
SAVE_ACC1 (&acc1, 0);
SAVE_ACC (&acc2, 2);
SAVE_ACC1 (&acc3, 2);
SAVE_ACC (&acc4, 4);
SAVE_ACC1 (&acc5, 4);
SAVE_ACC (&acc6, 6);
SAVE_ACC1 (&acc7, 6);
CO += 8;
AO += temp << 3;
BO += temp << 3;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 8)
#endif
}
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 8);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 2];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
}
SAVE_ACC (&acc0, 0);
SAVE_ACC1 (&acc1, 0);
SAVE_ACC (&acc2, 2);
SAVE_ACC1 (&acc3, 2);
CO += 4;
AO += temp << 2;
BO += temp << 3;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 8)
#endif
}
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 8);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
BLASLONG l = 0;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 1];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
}
SAVE_ACC (&acc0, 0);
SAVE_ACC1 (&acc1, 0);
CO += 2;
AO += temp << 1;
BO += temp << 3;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 8)
#endif
}
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 8);
#else
BO = B;
temp = k;
#endif
BLASLONG l = 0;
v4sf_t t = { 0, 0 };
v4sf_t t1 = { 0, 0 };
v4sf_t t2 = { 0, 0 };
v4sf_t t3 = { 0, 0 };
for (l = 0; l < temp; l++)
{
v4sf_t rowA = { AO[l], AO[l] };
v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
t += rowA * rowB;
t1 += rowA * rowB1;
t2 += rowA * rowB2;
t3 += rowA * rowB3;
}
t = t * valpha;
t1 = t1 * valpha;
t2 = t2 * valpha;
t3 = t3 * valpha;
#if defined(TRMMKERNEL)
CO[0 * ldc] = t[0];
CO[1 * ldc] = t[1];
CO[2 * ldc] = t1[0];
CO[3 * ldc] = t1[1];
CO[4 * ldc] = t2[0];
CO[5 * ldc] = t2[1];
CO[6 * ldc] = t3[0];
CO[7 * ldc] = t3[1];
#else
CO[0 * ldc] += t[0];
CO[1 * ldc] += t[1];
CO[2 * ldc] += t1[0];
CO[3 * ldc] += t1[1];
CO[4 * ldc] += t2[0];
CO[5 * ldc] += t2[1];
CO[6 * ldc] += t3[0];
CO[7 * ldc] += t3[1];
#endif
CO += 1;
AO += temp;
BO += temp << 3;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 8)
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 8; // number of values in A
#endif
B += k << 3;
}
if (n & 4)
{
BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
i = m >> 4;
for (j = 0; j < i; j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (16, 4);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
BLASLONG l = 0;
PREFETCH1 (CO, 0);
PREFETCH1 (CO + ldc, 0);
PREFETCH1 (CO + ldc + ldc, 0);
PREFETCH1 (CO + ldc + ldc + ldc, 0);
PREFETCH1 (CO, 128);
PREFETCH1 (CO + ldc, 128);
PREFETCH1 (CO + ldc + ldc, 128);
PREFETCH1 (CO + ldc + ldc + ldc, 128);
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 4];
rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
}
SAVE_ACC (&acc0, 0);
SAVE_ACC (&acc2, 4);
SAVE_ACC (&acc1, 2);
SAVE_ACC (&acc3, 6);
SAVE_ACC (&acc4, 8);
SAVE_ACC (&acc6, 12);
SAVE_ACC (&acc5, 10);
SAVE_ACC (&acc7, 14);
AO += temp << 4;
BO += temp << 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 4)
#endif
CO += 16;
}
i = (m & 15) >> 3;
for (j = 0; j < i; j++)
for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (8, 4)
#endif
}
i = (m & 7) >> 2;
for (j = 0; j < i; j++)
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (4, 4)
#endif
}
i = (m & 3) >> 1;
for (j = 0; j < i; j++)
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (2, 4)
#endif
}
i = (m & 1) >> 0;
for (j = 0; j < i; j++)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
B += k << 2;
}
N = (n & 3) >> 1;
for (i1 = 0; i1 < N; i1++)
if (n & 2)
{
BLASLONG i, j, temp;
BLASLONG j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
CO = C;
C += ldc << 1;
AO = A;
i = m >> 4;
for (j = 0; j < i; j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (16, 2);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
BLASLONG l = 0;
FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
vec_t *rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
for (l = 1; l < temp; l++)
{
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
rowA = (vec_t *) & AO[l << 4];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
}
SAVE2x4_ACC (&acc0, 0);
SAVE2x4_ACC (&acc1, 2);
SAVE2x4_ACC (&acc2, 4);
SAVE2x4_ACC (&acc3, 6);
SAVE2x4_ACC (&acc4, 8);
SAVE2x4_ACC (&acc5, 10);
SAVE2x4_ACC (&acc6, 12);
SAVE2x4_ACC (&acc7, 14);
CO += 16;
AO += temp << 4;
BO += temp << 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 2)
#endif
}
i = (m & 15) >> 3;
for (j = 0; j < i; j++)
for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (8, 2)
#endif
}
i = (m & 7) >> 2;
for (j = 0; j < i; j++)
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (4, 2)
#endif
}
i = (m & 3) >> 1;
for (j = 0; j < i; j++)
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (2, 2)
#endif
}
i = (m & 1) >> 0;
for (j = 0; j < i; j++)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
B += k << 1;
}
N = (n & 1) >> 0;
for (i1 = 0; i1 < N; i1++)
if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
CO = C;
C += ldc;
AO = A;
i = m;
while (i >= 16)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (16, 1)
#else
BO = B;
temp = k;
#endif
BLASLONG l = 0;
v4sf_t t = { 0, 0 };
v4sf_t t1 = { 0, 0 };
v4sf_t t2 = { 0, 0 };
v4sf_t t3 = { 0, 0 };
v4sf_t t4 = { 0, 0 };
v4sf_t t5 = { 0, 0 };
v4sf_t t6 = { 0, 0 };
v4sf_t t7 = { 0, 0 };
for (l = 0; l < temp; l++)
{
v4sf_t rowB = { BO[l], BO[l] };
v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
t += rowA * rowB;
t1 += rowA1 * rowB;
t2 += rowA2 * rowB;
t3 += rowA3 * rowB;
t4 += rowA4 * rowB;
t5 += rowA5 * rowB;
t6 += rowA6 * rowB;
t7 += rowA7 * rowB;
}
t = t * valpha;
t1 = t1 * valpha;
t2 = t2 * valpha;
t3 = t3 * valpha;
t4 = t4 * valpha;
t5 = t5 * valpha;
t6 = t6 * valpha;
t7 = t7 * valpha;
#if defined(TRMMKERNEL)
CO[0] = t[0];
CO[1] = t[1];
CO[2] = t1[0];
CO[3] = t1[1];
CO[4] = t2[0];
CO[5] = t2[1];
CO[6] = t3[0];
CO[7] = t3[1];
CO[8] = t4[0];
CO[9] = t4[1];
CO[10] = t5[0];
CO[11] = t5[1];
CO[12] = t6[0];
CO[13] = t6[1];
CO[14] = t7[0];
CO[15] = t7[1];
#else
CO[0] += t[0];
CO[1] += t[1];
CO[2] += t1[0];
CO[3] += t1[1];
CO[4] += t2[0];
CO[5] += t2[1];
CO[6] += t3[0];
CO[7] += t3[1];
CO[8] += t4[0];
CO[9] += t4[1];
CO[10] += t5[0];
CO[11] += t5[1];
CO[12] += t6[0];
CO[13] += t6[1];
CO[14] += t7[0];
CO[15] += t7[1];
#endif
AO += temp << 4;
BO += temp;
CO += 16;
i -= 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 1)
#endif
}
while (i >= 8)
for (i = 0; i < (m >> 3); i++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 3;
BO += temp;
CO += 8;
i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
while (i >= 4)
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 2;
BO += temp;
CO += 4;
i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
while (i >= 2)
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 1;
BO += temp;
CO += 2;
i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
while (i >= 1)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
CO[0] += t * alpha;
#endif
CO += 1;
i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif

View File

@ -0,0 +1,326 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <altivec.h>
#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp09, ctemp17, ctemp33;
IFLOAT ctemp25, ctemp41;
IFLOAT ctemp49, ctemp57;
aoffset = a;
boffset = b;
j = (n >> 3);
if (j > 0){
do{
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
i = (m >> 3);
if (i > 0){
do{
PREFETCHA (aoffset1, 384);
PREFETCHA (aoffset2, 384);
PREFETCHA (aoffset3, 384);
PREFETCHA (aoffset4, 384);
PREFETCHA (aoffset5, 384);
PREFETCHA (aoffset6, 384);
PREFETCHA (aoffset7, 384);
PREFETCHA (aoffset8, 384);
__vector double va0 = *(__vector double*)(aoffset1 + 0);
__vector double va1 = *(__vector double*)(aoffset1 + 2);
__vector double va2 = *(__vector double*)(aoffset1 + 4);
__vector double va3 = *(__vector double*)(aoffset1 + 6);
__vector double va4 = *(__vector double*)(aoffset2 + 0);
__vector double va5 = *(__vector double*)(aoffset2 + 2);
__vector double va6 = *(__vector double*)(aoffset2 + 4);
__vector double va7 = *(__vector double*)(aoffset2 + 6);
__vector double va8 = *(__vector double*)(aoffset3 + 0);
__vector double va9 = *(__vector double*)(aoffset3 + 2);
__vector double va10 = *(__vector double*)(aoffset3 + 4);
__vector double va11 = *(__vector double*)(aoffset3 + 6);
__vector double va12 = *(__vector double*)(aoffset4 + 0);
__vector double va13 = *(__vector double*)(aoffset4 + 2);
__vector double va14 = *(__vector double*)(aoffset4 + 4);
__vector double va15 = *(__vector double*)(aoffset4 + 6);
__vector double va16 = *(__vector double*)(aoffset5 + 0);
__vector double va17 = *(__vector double*)(aoffset5 + 2);
__vector double va18 = *(__vector double*)(aoffset5 + 4);
__vector double va19 = *(__vector double*)(aoffset5 + 6);
__vector double va20 = *(__vector double*)(aoffset6 + 0);
__vector double va21 = *(__vector double*)(aoffset6 + 2);
__vector double va22 = *(__vector double*)(aoffset6 + 4);
__vector double va23 = *(__vector double*)(aoffset6 + 6);
__vector double va24 = *(__vector double*)(aoffset7 + 0);
__vector double va25 = *(__vector double*)(aoffset7 + 2);
__vector double va26 = *(__vector double*)(aoffset7 + 4);
__vector double va27 = *(__vector double*)(aoffset7 + 6);
__vector double va28 = *(__vector double*)(aoffset8 + 0);
__vector double va29 = *(__vector double*)(aoffset8 + 2);
__vector double va30 = *(__vector double*)(aoffset8 + 4);
__vector double va31 = *(__vector double*)(aoffset8 + 6);
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
*(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
*(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
*(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
*(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
*(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
*(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
*(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
*(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
*(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
*(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
*(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
*(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
*(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
*(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
*(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
*(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
*(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
*(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
*(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
*(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
*(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
*(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
*(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
*(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
*(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
*(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
*(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
*(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
*(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
*(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
*(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
aoffset1 += 8;
aoffset2 += 8;
aoffset3 += 8;
aoffset4 += 8;
aoffset5 += 8;
aoffset6 += 8;
aoffset7 += 8;
aoffset8 += 8;
boffset += 64;
i --;
}while(i > 0);
}
i = (m & 7);
if (i > 0){
do{
ctemp01 = *(aoffset1 + 0);
ctemp09 = *(aoffset2 + 0);
ctemp17 = *(aoffset3 + 0);
ctemp25 = *(aoffset4 + 0);
ctemp33 = *(aoffset5 + 0);
ctemp41 = *(aoffset6 + 0);
ctemp49 = *(aoffset7 + 0);
ctemp57 = *(aoffset8 + 0);
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp09;
*(boffset + 2) = ctemp17;
*(boffset + 3) = ctemp25;
*(boffset + 4) = ctemp33;
*(boffset + 5) = ctemp41;
*(boffset + 6) = ctemp49;
*(boffset + 7) = ctemp57;
aoffset1 ++;
aoffset2 ++;
aoffset3 ++;
aoffset4 ++;
aoffset5 ++;
aoffset6 ++;
aoffset7 ++;
aoffset8 ++;
boffset += 8;
i --;
}while(i > 0);
}
j--;
}while(j > 0);
} /* end of if(j > 0) */
if (n & 4){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
i = (m >> 2);
if (i > 0){
do{
PREFETCHA (aoffset1, 384);
PREFETCHA (aoffset2, 384);
PREFETCHA (aoffset3, 384);
PREFETCHA (aoffset4, 384);
__vector double va0 = *(__vector double*)(aoffset1 + 0);
__vector double va1 = *(__vector double*)(aoffset1 + 2);
__vector double va2 = *(__vector double*)(aoffset2 + 0);
__vector double va3 = *(__vector double*)(aoffset2 + 2);
__vector double va4 = *(__vector double*)(aoffset3 + 0);
__vector double va5 = *(__vector double*)(aoffset3 + 2);
__vector double va6 = *(__vector double*)(aoffset4 + 0);
__vector double va7 = *(__vector double*)(aoffset4 + 2);
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);
*(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);
*(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);
*(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
*(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);
*(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
*(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
*(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
aoffset1 += 4;
aoffset2 += 4;
aoffset3 += 4;
aoffset4 += 4;
boffset += 16;
i --;
}while(i > 0);
}
i = (m & 3);
if (i > 0){
do{
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset2 + 0);
ctemp03 = *(aoffset3 + 0);
ctemp04 = *(aoffset4 + 0);
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
*(boffset + 2) = ctemp03;
*(boffset + 3) = ctemp04;
aoffset1 ++;
aoffset2 ++;
aoffset3 ++;
aoffset4 ++;
boffset += 4;
i --;
}while(i > 0);
}
} /* end of if(j > 0) */
if (n & 2){
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset += 2 * lda;
i = (m >> 1);
if (i > 0){
do{
__vector double va0 = *(__vector double*)(aoffset1 + 0);
__vector double va1 = *(__vector double*)(aoffset2 + 0);
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
*(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
aoffset1 += 2;
aoffset2 += 2;
boffset += 4;
i --;
}while(i > 0);
}
if (m & 1){
ctemp01 = *(aoffset1 + 0);
ctemp02 = *(aoffset2 + 0);
*(boffset + 0) = ctemp01;
*(boffset + 1) = ctemp02;
aoffset1 ++;
aoffset2 ++;
boffset += 2;
}
} /* end of if(j > 0) */
if (n & 1){
aoffset1 = aoffset;
i = m;
if (i > 0){
do{
ctemp01 = *(aoffset1 + 0);
*(boffset + 0) = ctemp01;
aoffset1 ++;
boffset ++;
i --;
}while(i > 0);
}
} /* end of if(j > 0) */
return 0;
}

View File

@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
)
{
BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
v4sf_t valpha = { alpha, alpha, alpha, alpha };
N = n >> 3;
for (i1 = 0; i1 < N; i1++)
for (i1 = 0; i1 < (n >> 3); i1++)
{
BLASLONG i, j, temp;
BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
i = m >> 4;
for (j = 0; j < i; j++)
for (j = 0; j < (m >> 4); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
#endif
CO += 16;
}
i = (m & 15) >> 3;
for (j = 0; j < i; j++)
if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (8, 8)
#endif
}
i = (m & 7) >> 2;
for (j = 0; j < i; j++)
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (4, 8)
#endif
}
i = (m & 3) >> 1;
for (j = 0; j < i; j++)
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (2, 8)
#endif
}
i = (m & 1) >> 0;
for (j = 0; j < i; j++)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
B += k << 3;
}
N = (n & 7) >> 2;
for (i1 = 0; i1 < N; i1++)
if (n & 4)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (16, 4)
#endif
}
i = (m & 15) >> 3;
for (j = 0; j < i; j++)
if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (8, 4)
#endif
}
i = (m & 7) >> 2;
for (j = 0; j < i; j++)
if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (4, 4)
#endif
}
i = (m & 3) >> 1;
for (j = 0; j < i; j++)
if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (2, 4)
#endif
}
i = (m & 1) >> 0;
for (j = 0; j < i; j++)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
B += k << 2;
}
N = (n & 3) >> 1;
for (i1 = 0; i1 < N; i1++)
if (n & 2)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (16, 2)
#endif
}
i = (m & 15) >> 3;
for (j = 0; j < i; j++)
if (m & 8)
{
FLOAT *BO;
v4sf_t *rowC;
@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (8, 2)
#endif
}
i = (m & 7) >> 2;
for (j = 0; j < i; j++)
if (m & 4)
{
FLOAT *BO;
v4sf_t *rowC;
@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (4, 2)
#endif
}
i = (m & 3) >> 1;
for (j = 0; j < i; j++)
if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
REFRESH_AFTER_SAVE (2, 2)
#endif
}
i = (m & 1) >> 0;
for (j = 0; j < i; j++)
if (m & 1)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
B += k << 1;
}
N = (n & 1) >> 0;
for (i1 = 0; i1 < N; i1++)
if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
CO = C;
C += ldc;
AO = A;
i = m;
while (i >= 16)
for (i = 0; i < (m >> 4); i++)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 4;
BO += temp;
CO += 16;
i -= 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 1)
#endif
}
while (i >= 8)
if (m & 8)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 3;
BO += temp;
CO += 8;
i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
while (i >= 4)
if (m & 4)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 2;
BO += temp;
CO += 4;
i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
while (i >= 2)
if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
AO += temp << 1;
BO += temp;
CO += 2;
i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
while (i >= 1)
if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
CO[0] += t * alpha;
#endif
CO += 1;
i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif

View File

@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SBGEMM_DEFAULT_P 832
#define SBGEMM_DEFAULT_Q 1026
#define SBGEMM_DEFAULT_R 4096
#undef DGEMM_DEFAULT_UNROLL_M
#undef DGEMM_DEFAULT_UNROLL_N
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#if defined(SPARC) && defined(V7)