Changed the C files to straighten out indentation. Removed commented-out lines from the other file.

This commit is contained in:
austinpagan 2024-02-01 18:46:07 -06:00
parent 461cf9083c
commit 87ba528d8b
3 changed files with 1403 additions and 1488 deletions

View File

@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c
ifeq ($(OSNAME), AIX)
#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
else
#CTRMMKERNEL = cgemm_kernel_power10.S
#ZTRMMKERNEL = zgemm_kernel_power10.S
CTRMMKERNEL = cgemm_kernel_power10.c
ZTRMMKERNEL = zgemm_kernel_power10.c
endif
SGEMMKERNEL = sgemm_kernel_power10.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
ifeq ($(OSNAME), AIX)
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power10.c
else
#CGEMMKERNEL = cgemm_kernel_power10.S
CGEMMKERNEL = cgemm_kernel_power10.c
endif
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ifeq ($(OSNAME), AIX)
@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ifeq ($(OSNAME), AIX)
#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power10.c
else
#ZGEMMKERNEL = zgemm_kernel_power10.S
ZGEMMKERNEL = zgemm_kernel_power10.c
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

View File

@ -507,12 +507,13 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
* GEMM Kernel
*************************************************************************************/
int
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
FLOAT * C, BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
#else
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
#endif
)
{
BLASLONG i1, i, l, temp;
FLOAT *AO, *BO, *CO;
@ -529,8 +530,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
FLOAT *res, tr[64], ti[64];
res = (FLOAT *) result;
for (i1 = 0; i1 < (n >> 2); i1++)
{
for (i1 = 0; i1 < (n >> 2); i1++) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
@ -538,8 +538,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
CO = C;
C += ldc << 3;
for (i = 0; i < (m >> 3); i++)
{
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 4);
#else
@ -547,8 +546,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < temp; ++l)
{
for (l = 0; l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<4];
vec_t rowB1 = *(vec_t *) & BO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
@ -572,8 +570,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (8, 4)
#endif
}
if (m & 4)
{
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 4);
#else
@ -581,8 +578,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
for (l = 0; l < (temp & (~1)); l+=2) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
@ -600,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
__builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
}
for (l = (temp & (~1)); l < temp; ++l)
{
for (l = (temp & (~1)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowB1 = *(vec_t *) & BO[l<<3];
@ -620,8 +615,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (4, 4)
#endif
}
if (m & 2)
{
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 4);
#else
@ -629,8 +623,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
@ -652,8 +645,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowB1 = *(vec_t *) & BO[l<<3];
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
@ -668,8 +660,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (2, 4)
#endif
}
if (m & 1)
{
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 4)
#else
@ -677,8 +668,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
@ -700,8 +690,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
__builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowB1 = *(vec_t *) & BO[l<<3];
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
@ -723,8 +712,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
B += k << 3;
}
if (n & 2)
{
if (n & 2) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
@ -732,8 +720,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
CO = C;
C += ldc << 2;
for (i = 0; i < (m >> 3); i++)
{
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
@ -741,8 +728,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
for (l = 0; l < (temp & (~1)); l+=2) {
vec_t rowA1 = *(vec_t *) & AO[l<<4];
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
@ -762,8 +748,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
__builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l)
{
for (l = (temp & (~1)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<4];
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
@ -782,8 +767,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (8, 2)
#endif
}
if (m & 4)
{
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
@ -791,8 +775,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
@ -814,8 +797,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
__builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowB1 = *(vec_t *) & BO[l<<2];
@ -829,8 +811,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
} if (m & 2)
{
}
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 2)
#else
@ -838,8 +820,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
@ -865,8 +846,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
__builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowB1 = *(vec_t *) & BO[l<<2];
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
@ -879,18 +859,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (2, 2)
#endif
}
if (m & 1)
{
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 2)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
@ -916,8 +893,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowB1 = *(vec_t *) & BO[l<<2];
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
@ -936,8 +912,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
B += k << 2;
}
if (n & 1)
{
if (n & 1) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
@ -945,8 +920,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
CO = C;
C += ldc << 1;
for (i = 0; i < (m >> 3); i++)
{
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 1)
#else
@ -954,8 +928,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
for (l = 0; l < (temp & (~1)); l+=2) {
vec_t rowA1 = *(vec_t *) & AO[l<<4];
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
@ -975,8 +948,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l)
{
for (l = (temp & (~1)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<4];
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
@ -995,8 +967,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (8, 1)
#endif
}
if (m & 4)
{
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 1)
#else
@ -1004,8 +975,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
@ -1027,8 +997,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<3];
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
vec_t rowB1 = *(vec_t *) & BO[l<<1];
@ -1043,18 +1012,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (4, 1)
#endif
}
if (m & 2)
{
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 1)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
@ -1080,8 +1046,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<2];
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
@ -1094,18 +1059,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (2, 1)
#endif
}
if (m & 1)
{
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 1)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
@ -1131,8 +1093,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
vec_t rowA1 = *(vec_t *) & AO[l<<1];
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);

View File

@ -276,12 +276,13 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
* GEMM Kernel
*************************************************************************************/
int
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
FLOAT * C, BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
#else
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
#endif
)
{
BLASLONG i1, i, l, temp;
FLOAT *AO, *BO, *CO;
@ -297,16 +298,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
FLOAT *res, tr[16], ti[16];
res = (FLOAT *) result;
for (i1 = 0; i1 < (n >> 1); i1++)
{
for (i1 = 0; i1 < (n >> 1); i1++) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<2;
for (i = 0; i < (m >> 3); i++)
{
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
@ -314,8 +313,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < temp; ++l)
{
for (l = 0; l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
@ -395,8 +393,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (8, 2)
#endif
}
if (m & 4)
{
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
@ -404,8 +401,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
for (l = 0; l < (temp & (~1)); l+=2) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
@ -423,8 +419,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
}
for (l = (temp & (~1)); l < temp; ++l)
{
for (l = (temp & (~1)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
@ -443,8 +438,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (4, 2)
#endif
}
if (m & 2)
{
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 2)
#else
@ -452,8 +446,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
@ -475,8 +468,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
@ -491,18 +483,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (2, 2)
#endif
}
if (m & 1)
{
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 2)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
@ -524,8 +513,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<2];
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
@ -545,16 +533,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
#endif
B += k << 2;
}
if (n & 1)
{
if (n & 1) {
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
AO = A;
CO = C;
C += ldc<<1;
for (i = 0; i < (m >> 3); i++)
{
for (i = 0; i < (m >> 3); i++) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 1)
#else
@ -562,8 +548,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~1)); l+=2)
{
for (l = 0; l < (temp & (~1)); l+=2) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
@ -583,8 +568,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
}
for (l = (temp & (~1)); l < temp; ++l)
{
for (l = (temp & (~1)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
@ -604,8 +588,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (8, 1)
#endif
}
if (m & 4)
{
if (m & 4) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 1)
#else
@ -613,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~3)); l+=4)
{
for (l = 0; l < (temp & (~3)); l+=4) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
@ -636,8 +618,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
}
for (l = (temp & (~3)); l < temp; ++l)
{
for (l = (temp & (~3)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
@ -651,8 +632,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
} if (m & 2)
{
}
if (m & 2) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (2, 1)
#else
@ -660,8 +641,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
temp = k;
#endif
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
@ -687,8 +667,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
@ -701,18 +680,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
REFRESH_AFTER_SAVE (2, 1)
#endif
}
if (m & 1)
{
if (m & 1) {
#if defined(TRMMKERNEL)
REFRESH_POINTERS (1, 1)
#else
BO = B;
temp = k;
#endif
// RIP OUT MMA STUFF!
SET_ACC_ZERO()
for (l = 0; l < (temp & (~7)); l+=8)
{
for (l = 0; l < (temp & (~7)); l+=8) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
@ -738,8 +714,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
}
for (l = (temp & (~7)); l < temp; ++l)
{
for (l = (temp & (~7)); l < temp; ++l) {
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
vec_t rowB1 = *(vec_t *) & BO[l<<1];
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);