Changed C files to straighten out indentation. Removed commented lines from other file.
This commit is contained in:
parent
461cf9083c
commit
87ba528d8b
|
@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRMMKERNEL = sgemm_kernel_power10.c
|
STRMMKERNEL = sgemm_kernel_power10.c
|
||||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||||
ifeq ($(OSNAME), AIX)
|
|
||||||
#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
|
||||||
#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
|
||||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
CTRMMKERNEL = cgemm_kernel_power10.c
|
||||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
ZTRMMKERNEL = zgemm_kernel_power10.c
|
||||||
else
|
|
||||||
#CTRMMKERNEL = cgemm_kernel_power10.S
|
|
||||||
#ZTRMMKERNEL = zgemm_kernel_power10.S
|
|
||||||
CTRMMKERNEL = cgemm_kernel_power10.c
|
|
||||||
ZTRMMKERNEL = zgemm_kernel_power10.c
|
|
||||||
endif
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_power10.c
|
SGEMMKERNEL = sgemm_kernel_power10.c
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
|
||||||
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
|
||||||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
|
||||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
|
||||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
CGEMMKERNEL = cgemm_kernel_power10.c
|
||||||
else
|
|
||||||
#CGEMMKERNEL = cgemm_kernel_power10.S
|
|
||||||
CGEMMKERNEL = cgemm_kernel_power10.c
|
|
||||||
endif
|
|
||||||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
|
@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
|
||||||
#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
ZGEMMKERNEL = zgemm_kernel_power10.c
|
||||||
else
|
|
||||||
#ZGEMMKERNEL = zgemm_kernel_power10.S
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_power10.c
|
|
||||||
endif
|
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
|
|
|
@ -507,12 +507,13 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||||
* GEMM Kernel
|
* GEMM Kernel
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
int
|
int
|
||||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
|
|
||||||
FLOAT * C, BLASLONG ldc
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
, BLASLONG offset
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
|
||||||
|
#else
|
||||||
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||||
#endif
|
#endif
|
||||||
)
|
|
||||||
{
|
{
|
||||||
BLASLONG i1, i, l, temp;
|
BLASLONG i1, i, l, temp;
|
||||||
FLOAT *AO, *BO, *CO;
|
FLOAT *AO, *BO, *CO;
|
||||||
|
@ -529,8 +530,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
FLOAT *res, tr[64], ti[64];
|
FLOAT *res, tr[64], ti[64];
|
||||||
res = (FLOAT *) result;
|
res = (FLOAT *) result;
|
||||||
|
|
||||||
for (i1 = 0; i1 < (n >> 2); i1++)
|
for (i1 = 0; i1 < (n >> 2); i1++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
off = offset;
|
off = offset;
|
||||||
#endif
|
#endif
|
||||||
|
@ -538,8 +538,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
CO = C;
|
CO = C;
|
||||||
C += ldc << 3;
|
C += ldc << 3;
|
||||||
|
|
||||||
for (i = 0; i < (m >> 3); i++)
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (8, 4);
|
REFRESH_POINTERS (8, 4);
|
||||||
#else
|
#else
|
||||||
|
@ -547,8 +546,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < temp; ++l)
|
for (l = 0; l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||||
|
@ -572,8 +570,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (8, 4)
|
REFRESH_AFTER_SAVE (8, 4)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 4)
|
if (m & 4) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (4, 4);
|
REFRESH_POINTERS (4, 4);
|
||||||
#else
|
#else
|
||||||
|
@ -581,8 +578,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~1)); l+=2)
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||||
|
@ -600,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
|
__builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
|
||||||
__builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
|
__builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~1)); l < temp; ++l)
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||||
|
@ -620,8 +615,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (4, 4)
|
REFRESH_AFTER_SAVE (4, 4)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 2)
|
if (m & 2) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (2, 4);
|
REFRESH_POINTERS (2, 4);
|
||||||
#else
|
#else
|
||||||
|
@ -629,8 +623,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||||
|
@ -652,8 +645,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
|
__builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
|
__builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||||
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
||||||
|
@ -668,8 +660,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (2, 4)
|
REFRESH_AFTER_SAVE (2, 4)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 1)
|
if (m & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (1, 4)
|
REFRESH_POINTERS (1, 4)
|
||||||
#else
|
#else
|
||||||
|
@ -677,8 +668,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||||
|
@ -700,8 +690,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
|
__builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
|
__builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
vec_t rowB1 = *(vec_t *) & BO[l<<3];
|
||||||
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
|
||||||
|
@ -723,8 +712,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
B += k << 3;
|
B += k << 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n & 2)
|
if (n & 2) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
off = offset;
|
off = offset;
|
||||||
#endif
|
#endif
|
||||||
|
@ -732,8 +720,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
CO = C;
|
CO = C;
|
||||||
C += ldc << 2;
|
C += ldc << 2;
|
||||||
|
|
||||||
for (i = 0; i < (m >> 3); i++)
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (8, 2)
|
REFRESH_POINTERS (8, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -741,8 +728,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~1)); l+=2)
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||||
|
@ -762,8 +748,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
|
__builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
|
||||||
__builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
|
__builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~1)); l < temp; ++l)
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||||
|
@ -782,8 +767,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (8, 2)
|
REFRESH_AFTER_SAVE (8, 2)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 4)
|
if (m & 4) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (4, 2)
|
REFRESH_POINTERS (4, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -791,8 +775,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||||
|
@ -814,8 +797,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
|
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
|
||||||
__builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
|
__builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
@ -829,8 +811,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_AFTER_SAVE (4, 2)
|
REFRESH_AFTER_SAVE (4, 2)
|
||||||
#endif
|
#endif
|
||||||
} if (m & 2)
|
}
|
||||||
{
|
if (m & 2) {
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (2, 2)
|
REFRESH_POINTERS (2, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -838,8 +820,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||||
|
@ -865,8 +846,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
|
__builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
|
__builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||||
|
@ -879,18 +859,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (2, 2)
|
REFRESH_AFTER_SAVE (2, 2)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 1)
|
if (m & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (1, 2)
|
REFRESH_POINTERS (1, 2)
|
||||||
#else
|
#else
|
||||||
BO = B;
|
BO = B;
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
// RIP OUT MMA STUFF!
|
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||||
|
@ -916,8 +893,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||||
|
@ -936,8 +912,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
B += k << 2;
|
B += k << 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n & 1)
|
if (n & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
off = offset;
|
off = offset;
|
||||||
#endif
|
#endif
|
||||||
|
@ -945,8 +920,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
CO = C;
|
CO = C;
|
||||||
C += ldc << 1;
|
C += ldc << 1;
|
||||||
|
|
||||||
for (i = 0; i < (m >> 3); i++)
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (8, 1)
|
REFRESH_POINTERS (8, 1)
|
||||||
#else
|
#else
|
||||||
|
@ -954,8 +928,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~1)); l+=2)
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||||
|
@ -975,8 +948,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
|
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
|
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~1)); l < temp; ++l)
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
vec_t rowA1 = *(vec_t *) & AO[l<<4];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
|
||||||
|
@ -995,8 +967,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (8, 1)
|
REFRESH_AFTER_SAVE (8, 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 4)
|
if (m & 4) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (4, 1)
|
REFRESH_POINTERS (4, 1)
|
||||||
#else
|
#else
|
||||||
|
@ -1004,8 +975,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
|
||||||
|
@ -1027,8 +997,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
|
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
|
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
vec_t rowA1 = *(vec_t *) & AO[l<<3];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
@ -1043,18 +1012,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (4, 1)
|
REFRESH_AFTER_SAVE (4, 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 2)
|
if (m & 2) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (2, 1)
|
REFRESH_POINTERS (2, 1)
|
||||||
#else
|
#else
|
||||||
BO = B;
|
BO = B;
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
// RIP OUT MMA STUFF!
|
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
|
||||||
|
@ -1080,8 +1046,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
vec_t rowA1 = *(vec_t *) & AO[l<<2];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||||
|
@ -1094,18 +1059,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (2, 1)
|
REFRESH_AFTER_SAVE (2, 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 1)
|
if (m & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (1, 1)
|
REFRESH_POINTERS (1, 1)
|
||||||
#else
|
#else
|
||||||
BO = B;
|
BO = B;
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
// RIP OUT MMA STUFF!
|
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
|
||||||
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
|
||||||
|
@ -1131,8 +1093,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
__builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
|
||||||
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
__builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
vec_t rowA1 = *(vec_t *) & AO[l<<1];
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
|
||||||
|
|
|
@ -276,12 +276,13 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||||
* GEMM Kernel
|
* GEMM Kernel
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
int
|
int
|
||||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
|
|
||||||
FLOAT * C, BLASLONG ldc
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
, BLASLONG offset
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
|
||||||
|
#else
|
||||||
|
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||||
#endif
|
#endif
|
||||||
)
|
|
||||||
{
|
{
|
||||||
BLASLONG i1, i, l, temp;
|
BLASLONG i1, i, l, temp;
|
||||||
FLOAT *AO, *BO, *CO;
|
FLOAT *AO, *BO, *CO;
|
||||||
|
@ -297,16 +298,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
FLOAT *res, tr[16], ti[16];
|
FLOAT *res, tr[16], ti[16];
|
||||||
res = (FLOAT *) result;
|
res = (FLOAT *) result;
|
||||||
|
|
||||||
for (i1 = 0; i1 < (n >> 1); i1++)
|
for (i1 = 0; i1 < (n >> 1); i1++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
off = offset;
|
off = offset;
|
||||||
#endif
|
#endif
|
||||||
AO = A;
|
AO = A;
|
||||||
CO = C;
|
CO = C;
|
||||||
C += ldc<<2;
|
C += ldc<<2;
|
||||||
for (i = 0; i < (m >> 3); i++)
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (8, 2)
|
REFRESH_POINTERS (8, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -314,8 +313,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < temp; ++l)
|
for (l = 0; l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
@ -395,8 +393,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (8, 2)
|
REFRESH_AFTER_SAVE (8, 2)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 4)
|
if (m & 4) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (4, 2)
|
REFRESH_POINTERS (4, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -404,8 +401,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~1)); l+=2)
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||||
|
@ -423,8 +419,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
|
__builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
|
||||||
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
__builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~1)); l < temp; ++l)
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
|
@ -443,8 +438,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (4, 2)
|
REFRESH_AFTER_SAVE (4, 2)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 2)
|
if (m & 2) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (2, 2)
|
REFRESH_POINTERS (2, 2)
|
||||||
#else
|
#else
|
||||||
|
@ -452,8 +446,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||||
|
@ -475,8 +468,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
@ -491,18 +483,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (2, 2)
|
REFRESH_AFTER_SAVE (2, 2)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 1)
|
if (m & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (1, 2)
|
REFRESH_POINTERS (1, 2)
|
||||||
#else
|
#else
|
||||||
BO = B;
|
BO = B;
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
// RIP OUT MMA STUFF!
|
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||||
|
@ -524,8 +513,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
__builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
|
||||||
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
__builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
vec_t rowB1 = *(vec_t *) & BO[l<<2];
|
||||||
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
|
||||||
|
@ -545,16 +533,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
#endif
|
#endif
|
||||||
B += k << 2;
|
B += k << 2;
|
||||||
}
|
}
|
||||||
if (n & 1)
|
if (n & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||||
off = offset;
|
off = offset;
|
||||||
#endif
|
#endif
|
||||||
AO = A;
|
AO = A;
|
||||||
CO = C;
|
CO = C;
|
||||||
C += ldc<<1;
|
C += ldc<<1;
|
||||||
for (i = 0; i < (m >> 3); i++)
|
for (i = 0; i < (m >> 3); i++) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (8, 1)
|
REFRESH_POINTERS (8, 1)
|
||||||
#else
|
#else
|
||||||
|
@ -562,8 +548,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~1)); l+=2)
|
for (l = 0; l < (temp & (~1)); l+=2) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
@ -583,8 +568,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
|
__builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
|
||||||
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
|
__builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~1)); l < temp; ++l)
|
for (l = (temp & (~1)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
|
||||||
|
@ -604,8 +588,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (8, 1)
|
REFRESH_AFTER_SAVE (8, 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 4)
|
if (m & 4) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (4, 1)
|
REFRESH_POINTERS (4, 1)
|
||||||
#else
|
#else
|
||||||
|
@ -613,8 +596,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~3)); l+=4)
|
for (l = 0; l < (temp & (~3)); l+=4) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
|
||||||
|
@ -636,8 +618,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
|
||||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~3)); l < temp; ++l)
|
for (l = (temp & (~3)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
|
@ -651,8 +632,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_AFTER_SAVE (4, 1)
|
REFRESH_AFTER_SAVE (4, 1)
|
||||||
#endif
|
#endif
|
||||||
} if (m & 2)
|
}
|
||||||
{
|
if (m & 2) {
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (2, 1)
|
REFRESH_POINTERS (2, 1)
|
||||||
#else
|
#else
|
||||||
|
@ -660,8 +641,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
|
||||||
|
@ -687,8 +667,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
@ -701,18 +680,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
REFRESH_AFTER_SAVE (2, 1)
|
REFRESH_AFTER_SAVE (2, 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
if (m & 1)
|
if (m & 1) {
|
||||||
{
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_POINTERS (1, 1)
|
REFRESH_POINTERS (1, 1)
|
||||||
#else
|
#else
|
||||||
BO = B;
|
BO = B;
|
||||||
temp = k;
|
temp = k;
|
||||||
#endif
|
#endif
|
||||||
// RIP OUT MMA STUFF!
|
|
||||||
SET_ACC_ZERO()
|
SET_ACC_ZERO()
|
||||||
for (l = 0; l < (temp & (~7)); l+=8)
|
for (l = 0; l < (temp & (~7)); l+=8) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
__vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
|
||||||
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
__vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
|
||||||
|
@ -738,8 +714,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
|
||||||
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
__builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
|
||||||
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
__builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
|
||||||
}
|
}
|
||||||
for (l = (temp & (~7)); l < temp; ++l)
|
for (l = (temp & (~7)); l < temp; ++l) {
|
||||||
{
|
|
||||||
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
__vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
|
||||||
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
vec_t rowB1 = *(vec_t *) & BO[l<<1];
|
||||||
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
__builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
|
||||||
|
|
Loading…
Reference in New Issue