Changed C files to straighten out indentation. Removed commented lines from other file.
This commit is contained in:
parent
461cf9083c
commit
87ba528d8b
|
@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
|
||||
STRMMKERNEL = sgemm_kernel_power10.c
|
||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||
ifeq ($(OSNAME), AIX)
|
||||
#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||
else
|
||||
#CTRMMKERNEL = cgemm_kernel_power10.S
|
||||
#ZTRMMKERNEL = zgemm_kernel_power10.S
|
||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||
endif
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
|
@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
|
|||
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
||||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||
else
|
||||
#CGEMMKERNEL = cgemm_kernel_power10.S
|
||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||
endif
|
||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ifeq ($(OSNAME), AIX)
|
||||
|
@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||
else
|
||||
#ZGEMMKERNEL = zgemm_kernel_power10.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
|
|
|
@ -507,12 +507,13 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
|||
* GEMM Kernel
|
||||
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
|
||||
FLOAT * C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
|
||||
#else
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i1, i, l, temp;
|
||||
FLOAT *AO, *BO, *CO;
|
||||
|
@ -529,8 +530,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
FLOAT *res, tr[64], ti[64];
|
||||
res = (FLOAT *) result;
|
||||
|
||||
for (i1 = 0; i1 < (n >> 2); i1++)
|
||||
{
|
||||
for (i1 = 0; i1 < (n >> 2); i1++) {
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
@ -538,8 +538,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
CO = C;
|
||||
C += ldc << 3;
|
||||
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
for (i = 0; i < (m >> 3); i++) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 4);
|
||||
#else
|
||||
|
@ -547,8 +546,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < temp; ++l)
|
||||
{
|
||||
for (l = 0; l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||
|
@ -572,8 +570,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (8, 4)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
if (m & 4) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 4);
|
||||
#else
|
||||
|
@ -581,8 +578,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||
|
@ -600,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
|
||||
__builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~1)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||
|
@ -620,8 +615,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (4, 4)
|
||||
#endif
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
if (m & 2) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 4);
|
||||
#else
|
||||
|
@ -629,8 +623,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||
|
@ -652,8 +645,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
||||
|
@ -668,8 +660,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (2, 4)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
if (m & 1) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 4)
|
||||
#else
|
||||
|
@ -677,8 +668,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||
|
@ -700,8 +690,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
||||
|
@ -723,8 +712,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
B += k << 3;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
if (n & 2) {
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
@ -732,8 +720,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
CO = C;
|
||||
C += ldc << 2;
|
||||
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
for (i = 0; i < (m >> 3); i++) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 2)
|
||||
#else
|
||||
|
@ -741,8 +728,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||
|
@ -762,8 +748,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
|
||||
__builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~1)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||
|
@ -782,8 +767,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
if (m & 4) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 2)
|
||||
#else
|
||||
|
@ -791,8 +775,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||
|
@ -814,8 +797,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
|
||||
__builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
|
@ -829,8 +811,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
} if (m & 2)
|
||||
{
|
||||
}
|
||||
if (m & 2) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 2)
|
||||
#else
|
||||
|
@ -838,8 +820,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||
|
@ -865,8 +846,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||
|
@ -879,18 +859,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
if (m & 1) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||
|
@ -916,8 +893,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||
|
@ -936,8 +912,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
B += k << 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
if (n & 1) {
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
@ -945,8 +920,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
CO = C;
|
||||
C += ldc << 1;
|
||||
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
for (i = 0; i < (m >> 3); i++) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 1)
|
||||
#else
|
||||
|
@ -954,8 +928,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||
|
@ -975,8 +948,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~1)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||
|
@ -995,8 +967,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
if (m & 4) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 1)
|
||||
#else
|
||||
|
@ -1004,8 +975,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||
|
@ -1027,8 +997,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
|
@ -1043,18 +1012,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
if (m & 2) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||
|
@ -1080,8 +1046,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||
|
@ -1094,18 +1059,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
if (m & 1) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||
|
@ -1131,8 +1093,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||
|
|
|
@ -276,12 +276,13 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
|||
* GEMM Kernel
|
||||
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
|
||||
FLOAT * C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
|
||||
#else
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i1, i, l, temp;
|
||||
FLOAT *AO, *BO, *CO;
|
||||
|
@ -297,16 +298,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
FLOAT *res, tr[16], ti[16];
|
||||
res = (FLOAT *) result;
|
||||
|
||||
for (i1 = 0; i1 < (n >> 1); i1++)
|
||||
{
|
||||
for (i1 = 0; i1 < (n >> 1); i1++) {
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
AO = A;
|
||||
CO = C;
|
||||
C += ldc<<2;
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
for (i = 0; i < (m >> 3); i++) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 2)
|
||||
#else
|
||||
|
@ -314,8 +313,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < temp; ++l)
|
||||
{
|
||||
for (l = 0; l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
|
@ -395,8 +393,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (8, 2)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
if (m & 4) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 2)
|
||||
#else
|
||||
|
@ -404,8 +401,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||
|
@ -423,8 +419,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~1)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
|
@ -443,8 +438,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (4, 2)
|
||||
#endif
|
||||
}
|
||||
if (m & 2)
|
||||
{
|
||||
if (m & 2) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 2)
|
||||
#else
|
||||
|
@ -452,8 +446,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||
|
@ -475,8 +468,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
|
@ -491,18 +483,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (2, 2)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
if (m & 1) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 2)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||
|
@ -524,8 +513,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||
|
@ -545,16 +533,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
#endif
|
||||
B += k << 2;
|
||||
}
|
||||
if (n & 1)
|
||||
{
|
||||
if (n & 1) {
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
AO = A;
|
||||
CO = C;
|
||||
C += ldc<<1;
|
||||
for (i = 0; i < (m >> 3); i++)
|
||||
{
|
||||
for (i = 0; i < (m >> 3); i++) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (8, 1)
|
||||
#else
|
||||
|
@ -562,8 +548,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~1)); l+=2)
|
||||
{
|
||||
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
|
@ -583,8 +568,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
|
||||
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
|
||||
}
|
||||
for (l = (temp & (~1)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~1)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||
|
@ -604,8 +588,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (8, 1)
|
||||
#endif
|
||||
}
|
||||
if (m & 4)
|
||||
{
|
||||
if (m & 4) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (4, 1)
|
||||
#else
|
||||
|
@ -613,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~3)); l+=4)
|
||||
{
|
||||
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||
|
@ -636,8 +618,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
|
||||
}
|
||||
for (l = (temp & (~3)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~3)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
|
@ -651,8 +632,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
#if defined(TRMMKERNEL)
|
||||
REFRESH_AFTER_SAVE (4, 1)
|
||||
#endif
|
||||
} if (m & 2)
|
||||
{
|
||||
}
|
||||
if (m & 2) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (2, 1)
|
||||
#else
|
||||
|
@ -660,8 +641,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
temp = k;
|
||||
#endif
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||
|
@ -687,8 +667,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
|
@ -701,18 +680,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
REFRESH_AFTER_SAVE (2, 1)
|
||||
#endif
|
||||
}
|
||||
if (m & 1)
|
||||
{
|
||||
if (m & 1) {
|
||||
#if defined(TRMMKERNEL)
|
||||
REFRESH_POINTERS (1, 1)
|
||||
#else
|
||||
BO = B;
|
||||
temp = k;
|
||||
#endif
|
||||
// RIP OUT MMA STUFF!
|
||||
SET_ACC_ZERO()
|
||||
for (l = 0; l < (temp & (~7)); l+=8)
|
||||
{
|
||||
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||
|
@ -738,8 +714,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
|||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||
}
|
||||
for (l = (temp & (~7)); l < temp; ++l)
|
||||
{
|
||||
for (l = (temp & (~7)); l < temp; ++l) {
|
||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||
|
|
Loading…
Reference in New Issue