POWER10: Change dgemm unroll factors
Changing the unroll factors for dgemm to 8 improves performance with the POWER10 MMA feature. Also made some minor changes in sgemm for edge cases.
This commit is contained in:
parent
aa21cb5217
commit
dd7a9cc5bf
|
@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power10.c
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = dgemm_ncopy_8_power10.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
|
|
@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG N = n;
|
||||
BLASLONG i1;
|
||||
#if defined(TRMMKERNEL)
|
||||
BLASLONG off;
|
||||
|
@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
off = -offset;
|
||||
#endif
|
||||
v4sf_t valpha = { alpha, alpha };
|
||||
N = n >> 2;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
for (i1 = 0; i1 < (n >> 3); i1++)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
BLASLONG j, temp;
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
CO = C;
|
||||
C += ldc << 3;
|
||||
AO = A;
|
||||
PREFETCH1 (A, 128);
|
||||
PREFETCH1 (A, 256);
|
||||
for (j = 0; j < (m >> 3); j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 8);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
SAVE_ACC (&acc2, 2);
|
||||
SAVE_ACC1 (&acc3, 2);
|
||||
SAVE_ACC (&acc4, 4);
|
||||
SAVE_ACC1 (&acc5, 4);
|
||||
SAVE_ACC (&acc6, 6);
|
||||
SAVE_ACC1 (&acc7, 6);
|
||||
CO += 8;
|
||||
AO += temp << 3;
|
||||
BO += temp << 3;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 8)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 8);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
SAVE_ACC (&acc2, 2);
|
||||
SAVE_ACC1 (&acc3, 2);
|
||||
CO += 4;
|
||||
AO += temp << 2;
|
||||
BO += temp << 3;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 8)
|
||||
#endif
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 8);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
BLASLONG l = 0;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC1 (&acc1, 0);
|
||||
CO += 2;
|
||||
AO += temp << 1;
|
||||
BO += temp << 3;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 8)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 8);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
v4sf_t t2 = { 0, 0 };
|
||||
v4sf_t t3 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowA = { AO[l], AO[l] };
|
||||
v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
|
||||
v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
|
||||
v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
|
||||
v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA * rowB1;
|
||||
t2 += rowA * rowB2;
|
||||
t3 += rowA * rowB3;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
t2 = t2 * valpha;
|
||||
t3 = t3 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0 * ldc] = t[0];
|
||||
CO[1 * ldc] = t[1];
|
||||
CO[2 * ldc] = t1[0];
|
||||
CO[3 * ldc] = t1[1];
|
||||
CO[4 * ldc] = t2[0];
|
||||
CO[5 * ldc] = t2[1];
|
||||
CO[6 * ldc] = t3[0];
|
||||
CO[7 * ldc] = t3[1];
|
||||
#else
|
||||
CO[0 * ldc] += t[0];
|
||||
CO[1 * ldc] += t[1];
|
||||
CO[2 * ldc] += t1[0];
|
||||
CO[3 * ldc] += t1[1];
|
||||
CO[4 * ldc] += t2[0];
|
||||
CO[5 * ldc] += t2[1];
|
||||
CO[6 * ldc] += t3[0];
|
||||
CO[7 * ldc] += t3[1];
|
||||
#endif
|
||||
CO += 1;
|
||||
AO += temp;
|
||||
BO += temp << 3;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 8)
|
||||
#endif
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 8; // number of values in A
|
||||
#endif
|
||||
B += k << 3;
|
||||
}
|
||||
if (n & 4)
|
||||
{
|
||||
BLASLONG j, temp;
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO = A;
|
||||
PREFETCH1 (A, 128);
|
||||
PREFETCH1 (A, 256);
|
||||
i = m >> 4;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 4);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
BLASLONG l = 0;
|
||||
PREFETCH1 (CO, 0);
|
||||
PREFETCH1 (CO + ldc, 0);
|
||||
PREFETCH1 (CO + ldc + ldc, 0);
|
||||
PREFETCH1 (CO + ldc + ldc + ldc, 0);
|
||||
PREFETCH1 (CO, 128);
|
||||
PREFETCH1 (CO + ldc, 128);
|
||||
PREFETCH1 (CO + ldc + ldc, 128);
|
||||
PREFETCH1 (CO + ldc + ldc + ldc, 128);
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 4];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
SAVE_ACC (&acc2, 4);
|
||||
SAVE_ACC (&acc1, 2);
|
||||
SAVE_ACC (&acc3, 6);
|
||||
SAVE_ACC (&acc4, 8);
|
||||
SAVE_ACC (&acc6, 12);
|
||||
SAVE_ACC (&acc5, 10);
|
||||
SAVE_ACC (&acc7, 14);
|
||||
AO += temp << 4;
|
||||
BO += temp << 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 4)
|
||||
#endif
|
||||
CO += 16;
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
for (j = 0; j < (m >> 3); j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (8, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (4, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (2, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
B += k << 2;
|
||||
}
|
||||
N = (n & 3) >> 1;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
if (n & 2)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
BLASLONG j, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
CO = C;
|
||||
C += ldc << 1;
|
||||
AO = A;
|
||||
i = m >> 4;
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 2);
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
v4sf_t *rowC;
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 4];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
|
||||
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
|
||||
__builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
|
||||
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
|
||||
__builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
|
||||
}
|
||||
SAVE2x4_ACC (&acc0, 0);
|
||||
SAVE2x4_ACC (&acc1, 2);
|
||||
SAVE2x4_ACC (&acc2, 4);
|
||||
SAVE2x4_ACC (&acc3, 6);
|
||||
SAVE2x4_ACC (&acc4, 8);
|
||||
SAVE2x4_ACC (&acc5, 10);
|
||||
SAVE2x4_ACC (&acc6, 12);
|
||||
SAVE2x4_ACC (&acc7, 14);
|
||||
CO += 16;
|
||||
AO += temp << 4;
|
||||
BO += temp << 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
for (j = 0; j < (m >> 3); j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
B += k << 1;
|
||||
}
|
||||
N = (n & 1) >> 0;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
if (n & 1)
|
||||
{
|
||||
BLASLONG i, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
CO = C;
|
||||
C += ldc;
|
||||
AO = A;
|
||||
i = m;
|
||||
while (i >= 16)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (16, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
BLASLONG l = 0;
|
||||
v4sf_t t = { 0, 0 };
|
||||
v4sf_t t1 = { 0, 0 };
|
||||
v4sf_t t2 = { 0, 0 };
|
||||
v4sf_t t3 = { 0, 0 };
|
||||
v4sf_t t4 = { 0, 0 };
|
||||
v4sf_t t5 = { 0, 0 };
|
||||
v4sf_t t6 = { 0, 0 };
|
||||
v4sf_t t7 = { 0, 0 };
|
||||
for (l = 0; l < temp; l++)
|
||||
{
|
||||
v4sf_t rowB = { BO[l], BO[l] };
|
||||
v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
|
||||
v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
|
||||
v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
|
||||
v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
|
||||
v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
|
||||
v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
|
||||
v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
|
||||
v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
|
||||
t += rowA * rowB;
|
||||
t1 += rowA1 * rowB;
|
||||
t2 += rowA2 * rowB;
|
||||
t3 += rowA3 * rowB;
|
||||
t4 += rowA4 * rowB;
|
||||
t5 += rowA5 * rowB;
|
||||
t6 += rowA6 * rowB;
|
||||
t7 += rowA7 * rowB;
|
||||
}
|
||||
t = t * valpha;
|
||||
t1 = t1 * valpha;
|
||||
t2 = t2 * valpha;
|
||||
t3 = t3 * valpha;
|
||||
t4 = t4 * valpha;
|
||||
t5 = t5 * valpha;
|
||||
t6 = t6 * valpha;
|
||||
t7 = t7 * valpha;
|
||||
#if defined(TRMMKERNEL)
|
||||
CO[0] = t[0];
|
||||
CO[1] = t[1];
|
||||
CO[2] = t1[0];
|
||||
CO[3] = t1[1];
|
||||
CO[4] = t2[0];
|
||||
CO[5] = t2[1];
|
||||
CO[6] = t3[0];
|
||||
CO[7] = t3[1];
|
||||
CO[8] = t4[0];
|
||||
CO[9] = t4[1];
|
||||
CO[10] = t5[0];
|
||||
CO[11] = t5[1];
|
||||
CO[12] = t6[0];
|
||||
CO[13] = t6[1];
|
||||
CO[14] = t7[0];
|
||||
CO[15] = t7[1];
|
||||
#else
|
||||
CO[0] += t[0];
|
||||
CO[1] += t[1];
|
||||
CO[2] += t1[0];
|
||||
CO[3] += t1[1];
|
||||
CO[4] += t2[0];
|
||||
CO[5] += t2[1];
|
||||
CO[6] += t3[0];
|
||||
CO[7] += t3[1];
|
||||
CO[8] += t4[0];
|
||||
CO[9] += t4[1];
|
||||
CO[10] += t5[0];
|
||||
CO[11] += t5[1];
|
||||
CO[12] += t6[0];
|
||||
CO[13] += t6[1];
|
||||
CO[14] += t7[0];
|
||||
CO[15] += t7[1];
|
||||
#endif
|
||||
AO += temp << 4;
|
||||
BO += temp;
|
||||
CO += 16;
|
||||
i -= 16;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 8)
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 3;
|
||||
BO += temp;
|
||||
CO += 8;
|
||||
i -= 8;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 4)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 2;
|
||||
BO += temp;
|
||||
CO += 4;
|
||||
i -= 4;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 2)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 1;
|
||||
BO += temp;
|
||||
CO += 2;
|
||||
i -= 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 1)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
CO[0] += t * alpha;
|
||||
#endif
|
||||
CO += 1;
|
||||
i -= 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 1)
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,326 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
BLASLONG i, j;
|
||||
|
||||
IFLOAT *aoffset;
|
||||
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||
|
||||
IFLOAT *boffset;
|
||||
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
IFLOAT ctemp09, ctemp17, ctemp33;
|
||||
IFLOAT ctemp25, ctemp41;
|
||||
IFLOAT ctemp49, ctemp57;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = (n >> 3);
|
||||
if (j > 0){
|
||||
do{
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (m >> 3);
|
||||
if (i > 0){
|
||||
do{
|
||||
PREFETCHA (aoffset1, 384);
|
||||
PREFETCHA (aoffset2, 384);
|
||||
PREFETCHA (aoffset3, 384);
|
||||
PREFETCHA (aoffset4, 384);
|
||||
PREFETCHA (aoffset5, 384);
|
||||
PREFETCHA (aoffset6, 384);
|
||||
PREFETCHA (aoffset7, 384);
|
||||
PREFETCHA (aoffset8, 384);
|
||||
__vector double va0 = *(__vector double*)(aoffset1 + 0);
|
||||
__vector double va1 = *(__vector double*)(aoffset1 + 2);
|
||||
__vector double va2 = *(__vector double*)(aoffset1 + 4);
|
||||
__vector double va3 = *(__vector double*)(aoffset1 + 6);
|
||||
|
||||
__vector double va4 = *(__vector double*)(aoffset2 + 0);
|
||||
__vector double va5 = *(__vector double*)(aoffset2 + 2);
|
||||
__vector double va6 = *(__vector double*)(aoffset2 + 4);
|
||||
__vector double va7 = *(__vector double*)(aoffset2 + 6);
|
||||
|
||||
__vector double va8 = *(__vector double*)(aoffset3 + 0);
|
||||
__vector double va9 = *(__vector double*)(aoffset3 + 2);
|
||||
__vector double va10 = *(__vector double*)(aoffset3 + 4);
|
||||
__vector double va11 = *(__vector double*)(aoffset3 + 6);
|
||||
|
||||
__vector double va12 = *(__vector double*)(aoffset4 + 0);
|
||||
__vector double va13 = *(__vector double*)(aoffset4 + 2);
|
||||
__vector double va14 = *(__vector double*)(aoffset4 + 4);
|
||||
__vector double va15 = *(__vector double*)(aoffset4 + 6);
|
||||
|
||||
__vector double va16 = *(__vector double*)(aoffset5 + 0);
|
||||
__vector double va17 = *(__vector double*)(aoffset5 + 2);
|
||||
__vector double va18 = *(__vector double*)(aoffset5 + 4);
|
||||
__vector double va19 = *(__vector double*)(aoffset5 + 6);
|
||||
|
||||
__vector double va20 = *(__vector double*)(aoffset6 + 0);
|
||||
__vector double va21 = *(__vector double*)(aoffset6 + 2);
|
||||
__vector double va22 = *(__vector double*)(aoffset6 + 4);
|
||||
__vector double va23 = *(__vector double*)(aoffset6 + 6);
|
||||
|
||||
__vector double va24 = *(__vector double*)(aoffset7 + 0);
|
||||
__vector double va25 = *(__vector double*)(aoffset7 + 2);
|
||||
__vector double va26 = *(__vector double*)(aoffset7 + 4);
|
||||
__vector double va27 = *(__vector double*)(aoffset7 + 6);
|
||||
|
||||
__vector double va28 = *(__vector double*)(aoffset8 + 0);
|
||||
__vector double va29 = *(__vector double*)(aoffset8 + 2);
|
||||
__vector double va30 = *(__vector double*)(aoffset8 + 4);
|
||||
__vector double va31 = *(__vector double*)(aoffset8 + 6);
|
||||
|
||||
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
|
||||
*(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
|
||||
*(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
|
||||
*(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
|
||||
*(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
|
||||
*(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
|
||||
*(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
|
||||
*(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
|
||||
|
||||
*(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
|
||||
*(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
|
||||
*(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
|
||||
*(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
|
||||
*(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
|
||||
*(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
|
||||
*(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
|
||||
*(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
|
||||
|
||||
*(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
|
||||
*(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
|
||||
*(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
|
||||
*(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
|
||||
*(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
|
||||
*(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
|
||||
*(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
|
||||
*(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
|
||||
|
||||
*(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
|
||||
*(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
|
||||
*(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
|
||||
*(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
|
||||
*(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
|
||||
*(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
|
||||
*(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
|
||||
*(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
|
||||
aoffset1 += 8;
|
||||
aoffset2 += 8;
|
||||
aoffset3 += 8;
|
||||
aoffset4 += 8;
|
||||
aoffset5 += 8;
|
||||
aoffset6 += 8;
|
||||
aoffset7 += 8;
|
||||
aoffset8 += 8;
|
||||
boffset += 64;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
i = (m & 7);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp09 = *(aoffset2 + 0);
|
||||
ctemp17 = *(aoffset3 + 0);
|
||||
ctemp25 = *(aoffset4 + 0);
|
||||
ctemp33 = *(aoffset5 + 0);
|
||||
ctemp41 = *(aoffset6 + 0);
|
||||
ctemp49 = *(aoffset7 + 0);
|
||||
ctemp57 = *(aoffset8 + 0);
|
||||
|
||||
*(boffset + 0) = ctemp01;
|
||||
*(boffset + 1) = ctemp09;
|
||||
*(boffset + 2) = ctemp17;
|
||||
*(boffset + 3) = ctemp25;
|
||||
*(boffset + 4) = ctemp33;
|
||||
*(boffset + 5) = ctemp41;
|
||||
*(boffset + 6) = ctemp49;
|
||||
*(boffset + 7) = ctemp57;
|
||||
|
||||
aoffset1 ++;
|
||||
aoffset2 ++;
|
||||
aoffset3 ++;
|
||||
aoffset4 ++;
|
||||
aoffset5 ++;
|
||||
aoffset6 ++;
|
||||
aoffset7 ++;
|
||||
aoffset8 ++;
|
||||
|
||||
boffset += 8;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
j--;
|
||||
}while(j > 0);
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
if (n & 4){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
i = (m >> 2);
|
||||
if (i > 0){
|
||||
do{
|
||||
PREFETCHA (aoffset1, 384);
|
||||
PREFETCHA (aoffset2, 384);
|
||||
PREFETCHA (aoffset3, 384);
|
||||
PREFETCHA (aoffset4, 384);
|
||||
__vector double va0 = *(__vector double*)(aoffset1 + 0);
|
||||
__vector double va1 = *(__vector double*)(aoffset1 + 2);
|
||||
__vector double va2 = *(__vector double*)(aoffset2 + 0);
|
||||
__vector double va3 = *(__vector double*)(aoffset2 + 2);
|
||||
__vector double va4 = *(__vector double*)(aoffset3 + 0);
|
||||
__vector double va5 = *(__vector double*)(aoffset3 + 2);
|
||||
__vector double va6 = *(__vector double*)(aoffset4 + 0);
|
||||
__vector double va7 = *(__vector double*)(aoffset4 + 2);
|
||||
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);
|
||||
*(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);
|
||||
*(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);
|
||||
*(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
|
||||
*(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);
|
||||
*(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
|
||||
*(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
|
||||
*(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
|
||||
|
||||
aoffset1 += 4;
|
||||
aoffset2 += 4;
|
||||
aoffset3 += 4;
|
||||
aoffset4 += 4;
|
||||
boffset += 16;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
i = (m & 3);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset2 + 0);
|
||||
ctemp03 = *(aoffset3 + 0);
|
||||
ctemp04 = *(aoffset4 + 0);
|
||||
|
||||
*(boffset + 0) = ctemp01;
|
||||
*(boffset + 1) = ctemp02;
|
||||
*(boffset + 2) = ctemp03;
|
||||
*(boffset + 3) = ctemp04;
|
||||
|
||||
aoffset1 ++;
|
||||
aoffset2 ++;
|
||||
aoffset3 ++;
|
||||
aoffset4 ++;
|
||||
|
||||
boffset += 4;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
if (n & 2){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset += 2 * lda;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
__vector double va0 = *(__vector double*)(aoffset1 + 0);
|
||||
__vector double va1 = *(__vector double*)(aoffset2 + 0);
|
||||
*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
|
||||
*(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
|
||||
|
||||
aoffset1 += 2;
|
||||
aoffset2 += 2;
|
||||
boffset += 4;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset2 + 0);
|
||||
|
||||
*(boffset + 0) = ctemp01;
|
||||
*(boffset + 1) = ctemp02;
|
||||
|
||||
aoffset1 ++;
|
||||
aoffset2 ++;
|
||||
boffset += 2;
|
||||
}
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
if (n & 1){
|
||||
aoffset1 = aoffset;
|
||||
|
||||
i = m;
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
|
||||
*(boffset + 0) = ctemp01;
|
||||
|
||||
aoffset1 ++;
|
||||
boffset ++;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG N = n;
|
||||
BLASLONG i1;
|
||||
#if defined(TRMMKERNEL)
|
||||
BLASLONG off;
|
||||
|
@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
|
||||
v4sf_t valpha = { alpha, alpha, alpha, alpha };
|
||||
N = n >> 3;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
for (i1 = 0; i1 < (n >> 3); i1++)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
BLASLONG j, temp;
|
||||
FLOAT *CO;
|
||||
FLOAT *AO;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO = A;
|
||||
PREFETCH1 (A, 128);
|
||||
PREFETCH1 (A, 256);
|
||||
i = m >> 4;
|
||||
for (j = 0; j < i; j++)
|
||||
for (j = 0; j < (m >> 4); j++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
#endif
|
||||
CO += 16;
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 8)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (8, 8)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (4, 8)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (2, 8)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
|
||||
B += k << 3;
|
||||
}
|
||||
N = (n & 7) >> 2;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
if (n & 4)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (16, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 8)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (8, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (4, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (2, 4)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
|
||||
B += k << 2;
|
||||
}
|
||||
N = (n & 3) >> 1;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
if (n & 2)
|
||||
{
|
||||
BLASLONG i, j, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (16, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 15) >> 3;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 8)
|
||||
{
|
||||
FLOAT *BO;
|
||||
v4sf_t *rowC;
|
||||
|
@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 7) >> 2;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
v4sf_t *rowC;
|
||||
|
@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 3) >> 1;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
i = (m & 1) >> 0;
|
||||
for (j = 0; j < i; j++)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
|
||||
B += k << 1;
|
||||
}
|
||||
N = (n & 1) >> 0;
|
||||
for (i1 = 0; i1 < N; i1++)
|
||||
if (n & 1)
|
||||
{
|
||||
BLASLONG i, temp;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
|
@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
CO = C;
|
||||
C += ldc;
|
||||
AO = A;
|
||||
i = m;
|
||||
while (i >= 16)
|
||||
for (i = 0; i < (m >> 4); i++)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 4;
|
||||
BO += temp;
|
||||
CO += 16;
|
||||
i -= 16;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (16, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 8)
|
||||
if (m & 8)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 3;
|
||||
BO += temp;
|
||||
CO += 8;
|
||||
i -= 8;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 4)
|
||||
if (m & 4)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 2;
|
||||
BO += temp;
|
||||
CO += 4;
|
||||
i -= 4;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 2)
|
||||
if (m & 2)
|
||||
{
|
||||
FLOAT *BO;
|
||||
BLASLONG l = 0;
|
||||
|
@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
AO += temp << 1;
|
||||
BO += temp;
|
||||
CO += 2;
|
||||
i -= 2;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
while (i >= 1)
|
||||
if (m & 1)
|
||||
{
|
||||
FLOAT *BO;
|
||||
#if defined(TRMMKERNEL)
|
||||
|
@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
CO[0] += t * alpha;
|
||||
#endif
|
||||
CO += 1;
|
||||
i -= 1;
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (1, 1)
|
||||
#endif
|
||||
|
|
4
param.h
4
param.h
|
@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SBGEMM_DEFAULT_P 832
|
||||
#define SBGEMM_DEFAULT_Q 1026
|
||||
#define SBGEMM_DEFAULT_R 4096
|
||||
#undef DGEMM_DEFAULT_UNROLL_M
|
||||
#undef DGEMM_DEFAULT_UNROLL_N
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
#endif
|
||||
|
||||
#if defined(SPARC) && defined(V7)
|
||||
|
|
Loading…
Reference in New Issue