diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index b4c7a5e41..28c39051f 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 DGEMMKERNEL    = dgemm_kernel_power10.c
-DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY    = dgemm_tcopy_16_power8.S
-DGEMMONCOPY    = dgemm_ncopy_4_power8.S
-DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMINCOPY    =
+DGEMMITCOPY    =
+DGEMMONCOPY    = dgemm_ncopy_8_power10.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
@@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
 DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c
index b2a29140e..b531799a6 100644
--- a/kernel/power/dgemm_kernel_power10.c
+++ b/kernel/power/dgemm_kernel_power10.c
@@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
 	  )
 {
-  BLASLONG N = n;
   BLASLONG i1;
 #if defined(TRMMKERNEL)
   BLASLONG off;
@@ -158,10 +157,221 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
   off = -offset;
 #endif
   v4sf_t valpha = { alpha, alpha };
-  N = n >> 2;
-  for (i1 = 0; i1 < N; i1++)
+  for (i1 = 0; i1 < (n >> 3); i1++)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
+      FLOAT *CO;
+      FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      CO = C;
+      C += ldc << 3;
+      AO = A;
+      PREFETCH1 (A, 128);
+      PREFETCH1 (A, 256);
+      for (j = 0; j < (m >> 3); j++)
+        {
+          FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+          v4sf_t *rowC;
+          v4sf_t result[4];
+          __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+          BLASLONG l = 0;
+          vec_t *rowA = (vec_t *) & AO[0];
+          vec_t *rb = (vec_t *) & BO[0];
+          __vector_pair rowB, rowB1;
+          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+          __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+          __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+          __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+          __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
+          __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
+          __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
+          __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
+          for (l = 1; l < temp; l++)
+            {
+              rowA = (vec_t *) & AO[l << 3];
+              rb = (vec_t *) & BO[l << 3];
+              __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+              __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+              __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+              __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+              __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+              __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+              __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
+              __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
+              __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
+              __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
+            }
+          SAVE_ACC (&acc0, 0);
+          SAVE_ACC1 (&acc1, 0);
+          SAVE_ACC (&acc2, 2);
+          SAVE_ACC1 (&acc3, 2);
+          SAVE_ACC (&acc4, 4);
+          SAVE_ACC1 (&acc5, 4);
+          SAVE_ACC (&acc6, 6);
+          SAVE_ACC1 (&acc7, 6);
+          CO += 8;
+          AO += temp << 3;
+          BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 8)
+#endif
+        }
+      if (m & 4)
+        {
+          FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+          v4sf_t *rowC;
+          v4sf_t result[4];
+          __vector_quad acc0, acc1, acc2, acc3;
+          BLASLONG l = 0;
+          vec_t *rowA = (vec_t *) & AO[0];
+          __vector_pair rowB, rowB1;
+          vec_t *rb = (vec_t *) & BO[0];
+          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+          __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+          __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+          __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+          for (l = 1; l < temp; l++)
+            {
+              rowA = (vec_t *) & AO[l << 2];
+              rb = (vec_t *) & BO[l << 3];
+              __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+              __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+              __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+              __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+              __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+              __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+            }
+          SAVE_ACC (&acc0, 0);
+          SAVE_ACC1 (&acc1, 0);
+          SAVE_ACC (&acc2, 2);
+          SAVE_ACC1 (&acc3, 2);
+          CO += 4;
+          AO += temp << 2;
+          BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 8)
+#endif
+        }
+      if (m & 2)
+        {
+          FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+          v4sf_t *rowC;
+          v4sf_t result[4];
+          __vector_quad acc0, acc1;
+          BLASLONG l = 0;
+          vec_t *rowA = (vec_t *) & AO[0];
+          __vector_pair rowB, rowB1;
+          vec_t *rb = (vec_t *) & BO[0];
+          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+          __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+          __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+          __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+          for (l = 1; l < temp; l++)
+            {
+              rowA = (vec_t *) & AO[l << 1];
+              rb = (vec_t *) & BO[l << 3];
+              __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+              __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+              __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+              __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+            }
+          SAVE_ACC (&acc0, 0);
+          SAVE_ACC1 (&acc1, 0);
+          CO += 2;
+          AO += temp << 1;
+          BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 8)
+#endif
+        }
+      if (m & 1)
+        {
+          FLOAT *BO;
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 8);
+#else
+          BO = B;
+          temp = k;
+#endif
+          BLASLONG l = 0;
+          v4sf_t t = { 0, 0 };
+          v4sf_t t1 = { 0, 0 };
+          v4sf_t t2 = { 0, 0 };
+          v4sf_t t3 = { 0, 0 };
+          for (l = 0; l < temp; l++)
+            {
+              v4sf_t rowA = { AO[l], AO[l] };
+              v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
+              v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
+              v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
+              v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
+              t += rowA * rowB;
+              t1 += rowA * rowB1;
+              t2 += rowA * rowB2;
+              t3 += rowA * rowB3;
+            }
+          t = t * valpha;
+          t1 = t1 * valpha;
+          t2 = t2 * valpha;
+          t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+          CO[0 * ldc] = t[0];
+          CO[1 * ldc] = t[1];
+          CO[2 * ldc] = t1[0];
+          CO[3 * ldc] = t1[1];
+          CO[4 * ldc] = t2[0];
+          CO[5 * ldc] = t2[1];
+          CO[6 * ldc] = t3[0];
+          CO[7 * ldc] = t3[1];
+#else
+          CO[0 * ldc] += t[0];
+          CO[1 * ldc] += t[1];
+          CO[2 * ldc] += t1[0];
+          CO[3 * ldc] += t1[1];
+          CO[4 * ldc] += t2[0];
+          CO[5 * ldc] += t2[1];
+          CO[6 * ldc] += t3[0];
+          CO[7 * ldc] += t3[1];
+#endif
+          CO += 1;
+          AO += temp;
+          BO += temp << 3;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 8)
+#endif
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 8;			// number of values in A
+#endif
+      B += k << 3;
+    }
+  if (n & 4)
+    {
+      BLASLONG j, temp;
       FLOAT *CO;
       FLOAT *AO;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -172,71 +382,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       AO = A;
       PREFETCH1 (A, 128);
       PREFETCH1 (A, 256);
-      i = m >> 4;
-      for (j = 0; j < i; j++)
-        {
-          FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 4);
-#else
-          BO = B;
-          temp = k;
-#endif
-          v4sf_t *rowC;
-          v4sf_t result[4];
-          BLASLONG l = 0;
-          PREFETCH1 (CO, 0);
-          PREFETCH1 (CO + ldc, 0);
-          PREFETCH1 (CO + ldc + ldc, 0);
-          PREFETCH1 (CO + ldc + ldc + ldc, 0);
-          PREFETCH1 (CO, 128);
-          PREFETCH1 (CO + ldc, 128);
-          PREFETCH1 (CO + ldc + ldc, 128);
-          PREFETCH1 (CO + ldc + ldc + ldc, 128);
-          __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-          vec_t *rowA = (vec_t *) & AO[0];
-          __vector_pair rowB;
-          vec_t *rb = (vec_t *) & BO[0];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-          __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
-          __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
-          __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
-          __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
-          __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
-          __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
-          __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
-          __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
-          for (l = 1; l < temp; l++)
-            {
-              rowA = (vec_t *) & AO[l << 4];
-              rb = (vec_t *) & BO[l << 2];
-              __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-              __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
-              __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
-              __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
-              __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
-              __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
-              __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
-              __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
-              __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
-            }
-          SAVE_ACC (&acc0, 0);
-          SAVE_ACC (&acc2, 4);
-          SAVE_ACC (&acc1, 2);
-          SAVE_ACC (&acc3, 6);
-          SAVE_ACC (&acc4, 8);
-          SAVE_ACC (&acc6, 12);
-          SAVE_ACC (&acc5, 10);
-          SAVE_ACC (&acc7, 14);
-          AO += temp << 4;
-          BO += temp << 2;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 4)
-#endif
-          CO += 16;
-        }
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 3); j++)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 4)
 #endif
         }
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 4)
 #endif
         }
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 4)
 #endif
         }
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 2;
     }
-  N = (n & 3) >> 1;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 2)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
@@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc << 1;
       AO = A;
-      i = m >> 4;
-      for (j = 0; j < i; j++)
-        {
-          FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 2);
-#else
-          BO = B;
-          temp = k;
-#endif
-          v4sf_t *rowC;
-          v4sf_t result[4];
-          __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-          BLASLONG l = 0;
-          FLOAT t[4] = { 0, 0, 0, 0 };
-          t[0] = BO[0], t[1] = BO[1];
-          __vector_pair rowB;
-          vec_t *rb = (vec_t *) & t[0];
-          __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-          vec_t *rowA = (vec_t *) & AO[0];
-          __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
-          __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
-          __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
-          __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
-          __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
-          __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
-          __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
-          __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
-          for (l = 1; l < temp; l++)
-            {
-              t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
-              rb = (vec_t *) & t[0];
-              __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
-              rowA = (vec_t *) & AO[l << 4];
-              __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
-              __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
-              __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
-              __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
-              __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
-              __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
-              __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
-              __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
-            }
-          SAVE2x4_ACC (&acc0, 0);
-          SAVE2x4_ACC (&acc1, 2);
-          SAVE2x4_ACC (&acc2, 4);
-          SAVE2x4_ACC (&acc3, 6);
-          SAVE2x4_ACC (&acc4, 8);
-          SAVE2x4_ACC (&acc5, 10);
-          SAVE2x4_ACC (&acc6, 12);
-          SAVE2x4_ACC (&acc7, 14);
-          CO += 16;
-          AO += temp << 4;
-          BO += temp << 1;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 2)
-#endif
-        }
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 3); j++)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 2)
 #endif
         }
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 2)
 #endif
         }
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 2)
 #endif
         }
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 1;
     }
-  N = (n & 1) >> 0;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 1)
     {
       BLASLONG i, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc;
       AO = A;
-      i = m;
-      while (i >= 16)
-        {
-          FLOAT *BO;
-#if defined(TRMMKERNEL)
-          REFRESH_POINTERS (16, 1)
-#else
-          BO = B;
-          temp = k;
-#endif
-          BLASLONG l = 0;
-          v4sf_t t = { 0, 0 };
-          v4sf_t t1 = { 0, 0 };
-          v4sf_t t2 = { 0, 0 };
-          v4sf_t t3 = { 0, 0 };
-          v4sf_t t4 = { 0, 0 };
-          v4sf_t t5 = { 0, 0 };
-          v4sf_t t6 = { 0, 0 };
-          v4sf_t t7 = { 0, 0 };
-          for (l = 0; l < temp; l++)
-            {
-              v4sf_t rowB = { BO[l], BO[l] };
-              v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
-              v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
-              v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
-              v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
-              v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
-              v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
-              v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
-              v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
-              t += rowA * rowB;
-              t1 += rowA1 * rowB;
-              t2 += rowA2 * rowB;
-              t3 += rowA3 * rowB;
-              t4 += rowA4 * rowB;
-              t5 += rowA5 * rowB;
-              t6 += rowA6 * rowB;
-              t7 += rowA7 * rowB;
-            }
-          t = t * valpha;
-          t1 = t1 * valpha;
-          t2 = t2 * valpha;
-          t3 = t3 * valpha;
-          t4 = t4 * valpha;
-          t5 = t5 * valpha;
-          t6 = t6 * valpha;
-          t7 = t7 * valpha;
-#if defined(TRMMKERNEL)
-          CO[0] = t[0];
-          CO[1] = t[1];
-          CO[2] = t1[0];
-          CO[3] = t1[1];
-          CO[4] = t2[0];
-          CO[5] = t2[1];
-          CO[6] = t3[0];
-          CO[7] = t3[1];
-          CO[8] = t4[0];
-          CO[9] = t4[1];
-          CO[10] = t5[0];
-          CO[11] = t5[1];
-          CO[12] = t6[0];
-          CO[13] = t6[1];
-          CO[14] = t7[0];
-          CO[15] = t7[1];
-#else
-          CO[0] += t[0];
-          CO[1] += t[1];
-          CO[2] += t1[0];
-          CO[3] += t1[1];
-          CO[4] += t2[0];
-          CO[5] += t2[1];
-          CO[6] += t3[0];
-          CO[7] += t3[1];
-          CO[8] += t4[0];
-          CO[9] += t4[1];
-          CO[10] += t5[0];
-          CO[11] += t5[1];
-          CO[12] += t6[0];
-          CO[13] += t6[1];
-          CO[14] += t7[0];
-          CO[15] += t7[1];
-#endif
-          AO += temp << 4;
-          BO += temp;
-          CO += 16;
-          i -= 16;
-#if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (16, 1)
-#endif
-        }
-      while (i >= 8)
+      for (i = 0; i < (m >> 3); i++)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           AO += temp << 3;
           BO += temp;
           CO += 8;
-          i -= 8;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (8, 1)
 #endif
         }
-      while (i >= 4)
+      if (m & 4)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           AO += temp << 2;
           BO += temp;
           CO += 4;
-          i -= 4;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (4, 1)
 #endif
         }
-      while (i >= 2)
+      if (m & 2)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           AO += temp << 1;
           BO += temp;
           CO += 2;
-          i -= 2;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (2, 1)
 #endif
        }
-      while (i >= 1)
+      if (m & 1)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           CO[0] += t * alpha;
 #endif
           CO += 1;
-          i -= 1;
 #if defined(TRMMKERNEL)
           REFRESH_AFTER_SAVE (1, 1)
 #endif
diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c
new file mode 100644
index 000000000..9836c2e7f
--- /dev/null
+++ b/kernel/power/dgemm_ncopy_8_power10.c
@@ -0,0 +1,326 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
+/* POSSIBILITY OF SUCH DAMAGE.                                       */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  BLASLONG i, j;
+
+  IFLOAT *aoffset;
+  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+  IFLOAT *boffset;
+  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  IFLOAT ctemp09, ctemp17, ctemp33;
+  IFLOAT ctemp25, ctemp41;
+  IFLOAT ctemp49, ctemp57;
+
+  aoffset = a;
+  boffset = b;
+
+  j = (n >> 3);
+  if (j > 0){
+    do{
+      aoffset1 = aoffset;
+      aoffset2 = aoffset1 + lda;
+      aoffset3 = aoffset2 + lda;
+      aoffset4 = aoffset3 + lda;
+      aoffset5 = aoffset4 + lda;
+      aoffset6 = aoffset5 + lda;
+      aoffset7 = aoffset6 + lda;
+      aoffset8 = aoffset7 + lda;
+      aoffset += 8 * lda;
+
+      i = (m >> 3);
+      if (i > 0){
+	do{
+	  PREFETCHA (aoffset1, 384);
+	  PREFETCHA (aoffset2, 384);
+	  PREFETCHA (aoffset3, 384);
+	  PREFETCHA (aoffset4, 384);
+	  PREFETCHA (aoffset5, 384);
+	  PREFETCHA (aoffset6, 384);
+	  PREFETCHA (aoffset7, 384);
+	  PREFETCHA (aoffset8, 384);
+	  __vector double va0 = *(__vector double*)(aoffset1 + 0);
+	  __vector double va1 = *(__vector double*)(aoffset1 + 2);
+	  __vector double va2 = *(__vector double*)(aoffset1 + 4);
+	  __vector double va3 = *(__vector double*)(aoffset1 + 6);
+
+	  __vector double va4 = *(__vector double*)(aoffset2 + 0);
+	  __vector double va5 = *(__vector double*)(aoffset2 + 2);
+	  __vector double va6 = *(__vector double*)(aoffset2 + 4);
+	  __vector double va7 = *(__vector double*)(aoffset2 + 6);
+
+	  __vector double va8 = *(__vector double*)(aoffset3 + 0);
+	  __vector double va9 = *(__vector double*)(aoffset3 + 2);
+	  __vector double va10 = *(__vector double*)(aoffset3 + 4);
+	  __vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+	  __vector double va12 = *(__vector double*)(aoffset4 + 0);
+	  __vector double va13 = *(__vector double*)(aoffset4 + 2);
+	  __vector double va14 = *(__vector double*)(aoffset4 + 4);
+	  __vector double va15 = *(__vector double*)(aoffset4 + 6);
+
+	  __vector double va16 = *(__vector double*)(aoffset5 + 0);
+	  __vector double va17 = *(__vector double*)(aoffset5 + 2);
+	  __vector double va18 = *(__vector double*)(aoffset5 + 4);
+	  __vector double va19 = *(__vector double*)(aoffset5 + 6);
+
+	  __vector double va20 = *(__vector double*)(aoffset6 + 0);
+	  __vector double va21 = *(__vector double*)(aoffset6 + 2);
+	  __vector double va22 = *(__vector double*)(aoffset6 + 4);
+	  __vector double va23 = *(__vector double*)(aoffset6 + 6);
+
+	  __vector double va24 = *(__vector double*)(aoffset7 + 0);
+	  __vector double va25 = *(__vector double*)(aoffset7 + 2);
+	  __vector double va26 = *(__vector double*)(aoffset7 + 4);
+	  __vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+	  __vector double va28 = *(__vector double*)(aoffset8 + 0);
+	  __vector double va29 = *(__vector double*)(aoffset8 + 2);
+	  __vector double va30 = *(__vector double*)(aoffset8 + 4);
+	  __vector double va31 = *(__vector double*)(aoffset8 + 6);
+
+	  *(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va4, 0);
+	  *(__vector double*)(boffset +  2) = vec_xxpermdi(va8, va12, 0);
+	  *(__vector double*)(boffset +  4) = vec_xxpermdi(va16, va20, 0);
+	  *(__vector double*)(boffset +  6) = vec_xxpermdi(va24, va28, 0);
+	  *(__vector double*)(boffset +  8) = vec_xxpermdi(va0, va4, 3);
+	  *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
+	  *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
+	  *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
+
+	  *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
+	  *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
+	  *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
+	  *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
+	  *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
+	  *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
+	  *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
+	  *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
+
+	  *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
+	  *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
+	  *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
+	  *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
+	  *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
+	  *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
+	  *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
+	  *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
+
+	  *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
+	  *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
+	  *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
+	  *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
+	  *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
+	  *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
+	  *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
+	  *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
+	  aoffset1 += 8;
+	  aoffset2 += 8;
+	  aoffset3 += 8;
+	  aoffset4 += 8;
+	  aoffset5 += 8;
+	  aoffset6 += 8;
+	  aoffset7 += 8;
+	  aoffset8 += 8;
+	  boffset += 64;
+	  i --;
+	}while(i > 0);
+      }
+
+      i = (m & 7);
+      if (i > 0){
+	do{
+	  ctemp01 = *(aoffset1 + 0);
+	  ctemp09 = *(aoffset2 + 0);
+	  ctemp17 = *(aoffset3 + 0);
+	  ctemp25 = *(aoffset4 + 0);
+	  ctemp33 = *(aoffset5 + 0);
+	  ctemp41 = *(aoffset6 + 0);
+	  ctemp49 = *(aoffset7 + 0);
+	  ctemp57 = *(aoffset8 + 0);
+
+	  *(boffset + 0) = ctemp01;
+	  *(boffset + 1) = ctemp09;
+	  *(boffset + 2) = ctemp17;
+	  *(boffset + 3) = ctemp25;
+	  *(boffset + 4) = ctemp33;
+	  *(boffset + 5) = ctemp41;
+	  *(boffset + 6) = ctemp49;
+	  *(boffset + 7) = ctemp57;
+
+	  aoffset1 ++;
+	  aoffset2 ++;
+	  aoffset3 ++;
+	  aoffset4 ++;
+	  aoffset5 ++;
+	  aoffset6 ++;
+	  aoffset7 ++;
+	  aoffset8 ++;
+
+	  boffset += 8;
+	  i --;
+	}while(i > 0);
+      }
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 4){
+    aoffset1 = aoffset;
+    aoffset2 = aoffset1 + lda;
+    aoffset3 = aoffset2 + lda;
+    aoffset4 = aoffset3 + lda;
+    aoffset += 4 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      do{
+	PREFETCHA (aoffset1, 384);
+	PREFETCHA (aoffset2, 384);
+	PREFETCHA (aoffset3, 384);
+	PREFETCHA (aoffset4, 384);
+	__vector double va0 = *(__vector double*)(aoffset1 + 0);
+	__vector double va1 = *(__vector double*)(aoffset1 + 2);
+	__vector double va2 = *(__vector double*)(aoffset2 + 0);
+	__vector double va3 = *(__vector double*)(aoffset2 + 2);
+	__vector double va4 = *(__vector double*)(aoffset3 + 0);
+	__vector double va5 = *(__vector double*)(aoffset3 + 2);
+	__vector double va6 = *(__vector double*)(aoffset4 + 0);
+	__vector double va7 = *(__vector double*)(aoffset4 + 2);
+	*(__vector double*)(boffset +  0) = vec_xxpermdi(va0, va2, 0);
+	*(__vector double*)(boffset +  2) = vec_xxpermdi(va4, va6, 0);
+	*(__vector double*)(boffset +  4) = vec_xxpermdi(va0, va2, 3);
+	*(__vector double*)(boffset +  6) = vec_xxpermdi(va4, va6, 3);
+	*(__vector double*)(boffset +  8) = vec_xxpermdi(va1, va3, 0);
+	*(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
+	*(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
+	*(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
+
+	aoffset1 += 4;
+	aoffset2 += 4;
+	aoffset3 += 4;
+	aoffset4 += 4;
+	boffset += 16;
+	i --;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 + 0);
+	ctemp02 = *(aoffset2 + 0);
+	ctemp03 = *(aoffset3 + 0);
+	ctemp04 = *(aoffset4 + 0);
+
+	*(boffset + 0) = ctemp01;
+	*(boffset + 1) = ctemp02;
+	*(boffset + 2) = ctemp03;
+	*(boffset + 3) = ctemp04;
+
+	aoffset1 ++;
+	aoffset2 ++;
+	aoffset3 ++;
+	aoffset4 ++;
+
+	boffset += 4;
+	i --;
+      }while(i > 0);
+    }
+  } /* end of if(j > 0) */
+
+  if (n & 2){
+    aoffset1 = aoffset;
+    aoffset2 = aoffset1 + lda;
+    aoffset += 2 * lda;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	__vector double va0 = *(__vector double*)(aoffset1 + 0);
+	__vector double va1 = *(__vector double*)(aoffset2 + 0);
+	*(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
+	*(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
+
+	aoffset1 += 2;
+	aoffset2 += 2;
+	boffset += 4;
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 + 0);
+      ctemp02 = *(aoffset2 + 0);
+
+      *(boffset + 0) = ctemp01;
+      *(boffset + 1) = ctemp02;
+
+      aoffset1 ++;
+      aoffset2 ++;
+      boffset += 2;
+    }
+  } /* end of if(j > 0) */
+
+  if (n & 1){
+    aoffset1 = aoffset;
+
+    i = m;
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 + 0);
+
+	*(boffset + 0) = ctemp01;
+
+	aoffset1 ++;
+	boffset ++;
+	i --;
+      }while(i > 0);
+    }
+
+  } /* end of if(j > 0) */
+
+  return 0;
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c
index 9fbf84695..80f495f70 100644
--- a/kernel/power/sgemm_kernel_power10.c
+++ b/kernel/power/sgemm_kernel_power10.c
@@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
 	  )
 {
-  BLASLONG N = n;
   BLASLONG i1;
 #if defined(TRMMKERNEL)
   BLASLONG off;
@@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
   v4sf_t valpha = { alpha, alpha, alpha, alpha };
 
-  N = n >> 3;
-  for (i1 = 0; i1 < N; i1++)
+  for (i1 = 0; i1 < (n >> 3); i1++)
     {
-      BLASLONG i, j, temp;
+      BLASLONG j, temp;
       FLOAT *CO;
       FLOAT *AO;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       AO = A;
       PREFETCH1 (A, 128);
       PREFETCH1 (A, 256);
-      i = m >> 4;
-      for (j = 0; j < i; j++)
+      for (j = 0; j < (m >> 4); j++)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
           CO += 16;
         }
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 8)
 #endif
         }
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 8)
 #endif
         }
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 8)
 #endif
         }
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 3;
     }
-  N = (n & 7) >> 2;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 4)
     {
       BLASLONG i, j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (16, 4)
 #endif
         }
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 4)
 #endif
         }
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
         {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 4)
 #endif
         }
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 4)
 #endif
        }
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
        {
           FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 2;
     }
-  N = (n & 3) >> 1;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 2)
     {
       BLASLONG i, j, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (16, 2)
 #endif
         }
-      i = (m & 15) >> 3;
-      for (j = 0; j < i; j++)
+      if (m & 8)
         {
           FLOAT *BO;
           v4sf_t *rowC;
@@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (8, 2)
 #endif
         }
-      i = (m & 7) >> 2;
-      for (j = 0; j < i; j++)
+      if (m & 4)
         {
           FLOAT *BO;
           v4sf_t *rowC;
@@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (4, 2)
 #endif
         }
-      i = (m & 3) >> 1;
-      for (j = 0; j < i; j++)
+      if (m & 2)
         {
           FLOAT *BO;
           BLASLONG l = 0;
@@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
           REFRESH_AFTER_SAVE (2, 2)
 #endif
         }
-      i = (m & 1) >> 0;
-      for (j = 0; j < i; j++)
+      if (m & 1)
         {
           FLOAT *BO;
           BLASLONG l = 0;
@@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
 #endif
       B += k << 1;
     }
-  N = (n & 1) >> 0;
-  for (i1 = 0; i1 < N; i1++)
+  if (n & 1)
     {
       BLASLONG i, temp;
 #if defined(TRMMKERNEL) && defined(LEFT)
@@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
       CO = C;
       C += ldc;
      AO = A;
-      i = m;
-      while (i >= 16)
+      for (i = 0; i < (m >> 4); i++)
        {
          FLOAT *BO;
          BLASLONG l = 0;
@@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
          AO += temp << 4;
          BO += temp;
          CO += 16;
-         i -= 16;
 #if defined(TRMMKERNEL)
          REFRESH_AFTER_SAVE (16, 1)
 #endif
        }
-      while (i >= 8)
+      if (m & 8)
        {
          FLOAT *BO;
          BLASLONG l = 0;
@@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
          AO += temp << 3;
          BO += temp;
          CO += 8;
-         i -= 8;
 #if defined(TRMMKERNEL)
          REFRESH_AFTER_SAVE (8, 1)
 #endif
        }
-      while (i >= 4)
+      if (m & 4)
        {
          FLOAT *BO;
          BLASLONG l = 0;
@@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
          AO += temp << 2;
          BO += temp;
          CO += 4;
-         i -= 4;
 #if defined(TRMMKERNEL)
          REFRESH_AFTER_SAVE (4, 1)
 #endif
        }
-      while (i >= 2)
+      if (m & 2)
        {
          FLOAT *BO;
          BLASLONG l = 0;
@@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
          AO += temp << 1;
          BO += temp;
          CO += 2;
-         i -= 2;
 #if defined(TRMMKERNEL)
          REFRESH_AFTER_SAVE (2, 1)
 #endif
        }
-      while (i >= 1)
+      if (m & 1)
        {
          FLOAT *BO;
 #if defined(TRMMKERNEL)
@@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
          CO[0] += t * alpha;
 #endif
          CO += 1;
-         i -= 1;
 #if defined(TRMMKERNEL)
          REFRESH_AFTER_SAVE (1, 1)
 #endif
diff --git a/param.h b/param.h
index f3ddde6a1..2047e4776 100644
--- a/param.h
+++ b/param.h
@@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SBGEMM_DEFAULT_P 832
 #define SBGEMM_DEFAULT_Q 1026
 #define SBGEMM_DEFAULT_R 4096
+#undef DGEMM_DEFAULT_UNROLL_M
+#undef DGEMM_DEFAULT_UNROLL_N
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
 #endif
 
 #if defined(SPARC) && defined(V7)
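
Note for reviewers: the kernel above tiles eight MMA accumulators over an 8x8 block of C. For readers who want to follow the builtin sequence without reading the full unroll, here is a minimal, self-contained sketch of the same pattern reduced to a single 4x2 accumulator. It is illustrative only and not part of the patch: the function name mma_dgemm_4x2 and its packing layout are assumptions made for this example, while the builtins themselves (__builtin_mma_assemble_pair, __builtin_mma_xvf64ger, __builtin_mma_xvf64gerpp, __builtin_mma_disassemble_acc) are the GCC 10+ POWER10 MMA interface the kernel uses, and the (rb[1], rb[0]) operand order mirrors the kernel's little-endian register-pair convention. Build with something like: gcc -O2 -mcpu=power10 -mmma.

/* Sketch (hypothetical, not part of the patch): C(4x2) += A(4xK) * B(Kx2)
   using one MMA accumulator.  A is packed 4 doubles per k-step and B is
   packed 2 doubles per k-step, matching the packed-panel layout a GEMM
   copy routine would produce. */
#include <altivec.h>

typedef __vector unsigned char vec_t;

static void mma_dgemm_4x2 (long K, const double *A, const double *B,
                           double *C, long ldc)
{
  __vector_quad acc;          /* one 4x2 block of f64 accumulators      */
  vector double rows[4];      /* accumulator rows after disassembly     */
  __vector_pair colA;         /* 4 doubles of A for one k-step          */

  /* k = 0 initializes the accumulator: xvf64ger does not accumulate.   */
  __builtin_mma_assemble_pair (&colA, ((vec_t *) A)[1], ((vec_t *) A)[0]);
  __builtin_mma_xvf64ger (&acc, colA, *(vec_t *) B);

  /* Each remaining k-step is a rank-1 update of the 4x2 tile.          */
  for (long l = 1; l < K; l++)
    {
      __builtin_mma_assemble_pair (&colA, ((vec_t *) (A + 4 * l))[1],
                                   ((vec_t *) (A + 4 * l))[0]);
      __builtin_mma_xvf64gerpp (&acc, colA, *(vec_t *) (B + 2 * l));
    }

  /* Spill the accumulator back to four VSRs, then add into C.          */
  __builtin_mma_disassemble_acc (rows, &acc);
  for (int i = 0; i < 4; i++)
    {
      C[i * ldc + 0] += rows[i][0];
      C[i * ldc + 1] += rows[i][1];
    }
}

The full kernel is this pattern tiled: eight accumulators cover the 8x8 block, and the SAVE_ACC/SAVE_ACC1 macros defined earlier in dgemm_kernel_power10.c handle the alpha scaling and the store back to C during the disassemble step.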