From 55b244ca0da907b27c4e0306df0a1a90a2238c6a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 12 Oct 2018 09:30:35 +0000
Subject: [PATCH] Enable the SGEMM/SKX C-based kernel

In QA the final bug was found, so the Skylake-X SGEMM C-based kernel
can now be activated.
---
 kernel/x86_64/KERNEL.SKYLAKEX              |   9 +-
 kernel/x86_64/sgemm_beta_skylakex.c        |   6 +-
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 549 ---------------------
 3 files changed, 10 insertions(+), 554 deletions(-)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 48c81e80b..acc6356d6 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -1,6 +1,11 @@
 include $(KERNELDIR)/KERNEL.HASWELL
 
-SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
+SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
+
+SGEMMINCOPY = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
+SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
 
 DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
 
@@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
 DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
 
-SGEMM_BETA = ../generic/gemm_beta.c
+SGEMM_BETA = sgemm_beta_skylakex.c
 DGEMM_BETA = dgemm_beta_skylakex.c
diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c
index b1bf4d77a..54f9664e9 100644
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -60,8 +60,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 
   if (beta == ZERO){
     __m512 z_zero;
+    __m256 y_zero;
 
     z_zero = _mm512_setzero_ps();
+    y_zero = _mm256_setzero_ps();
     j = n;
     do {
       c_offset1 = c_offset;
@@ -71,14 +73,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 
       while (i > 32) {
           _mm512_storeu_ps(c_offset1, z_zero);
-          _mm512_storeu_ps(c_offset1 + 8, z_zero);
           _mm512_storeu_ps(c_offset1 + 16, z_zero);
-          _mm512_storeu_ps(c_offset1 + 24 , z_zero);
           c_offset1 += 32;
           i -= 32;
       }
       while (i > 8) {
-          _mm512_storeu_ps(c_offset1, z_zero);
+          _mm256_storeu_ps(c_offset1, y_zero);
           c_offset1 += 8;
           i -= 8;
       }
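The sgemm_beta_skylakex.c hunk above is the QA fix referenced in the commit
message. _mm512_storeu_ps writes 16 floats, so using it in the "i > 8" tail
loop while advancing the pointer by only 8 wrote past the run being zeroed;
the 8-float _mm256_storeu_ps matches the stride. The "i > 32" loop is also
trimmed: two 16-wide stores already cover 32 floats, so the extra stores at
offsets 8 and 24 were redundant overlapping writes. A minimal standalone
sketch of the corrected zero-fill shape (illustrative only; zero_run and the
scalar tail are assumptions, not the OpenBLAS function):

    #include <immintrin.h>
    #include <stdio.h>

    /* Sketch of the corrected loop structure; compile with -mavx512f. */
    static void zero_run(float *c, long i)
    {
        __m512 z_zero = _mm512_setzero_ps();
        __m256 y_zero = _mm256_setzero_ps();

        while (i > 32) {               /* two 16-wide stores = 32 floats */
            _mm512_storeu_ps(c,      z_zero);
            _mm512_storeu_ps(c + 16, z_zero);
            c += 32; i -= 32;
        }
        while (i > 8) {                /* 8-wide store matches the +8 stride;
                                          a 16-wide store here was the
                                          out-of-bounds bug */
            _mm256_storeu_ps(c, y_zero);
            c += 8; i -= 8;
        }
        while (i > 0) {                /* assumed scalar tail */
            *c++ = 0.0f; i--;
        }
    }

    int main(void)
    {
        float buf[40];
        for (int k = 0; k < 40; k++) buf[k] = 1.0f;
        zero_run(buf, 37);
        printf("%g %g\n", buf[36], buf[37]);   /* prints: 0 1 */
        return 0;
    }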
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index b2b1ab03f..10d3d22ed 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#define INIT32x8() \
-    row0 = _mm512_setzero_ps(); \
-    row1 = _mm512_setzero_ps(); \
-    row2 = _mm512_setzero_ps(); \
-    row3 = _mm512_setzero_ps(); \
-    row4 = _mm512_setzero_ps(); \
-    row5 = _mm512_setzero_ps(); \
-    row6 = _mm512_setzero_ps(); \
-    row0b = _mm512_setzero_ps(); \
-    row1b = _mm512_setzero_ps(); \
-    row2b = _mm512_setzero_ps(); \
-    row3b = _mm512_setzero_ps(); \
-    row4b = _mm512_setzero_ps(); \
-    row5b = _mm512_setzero_ps(); \
-    row6b = _mm512_setzero_ps(); \
-    row7b = _mm512_setzero_ps(); \
-
-#define KERNEL32x8_SUB() \
-    zmm0 = _mm512_loadu_ps(AO); \
-    zmm0b = _mm512_loadu_ps(AOb); \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
-    row0 += zmm0 * zmm2; \
-    row1 += zmm0 * zmm3; \
-    row0b += zmm0b * zmm2; \
-    row1b += zmm0b * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
-    row2 += zmm0 * zmm2; \
-    row3 += zmm0 * zmm3; \
-    row2b += zmm0b * zmm2; \
-    row3b += zmm0b * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
-    row4 += zmm0 * zmm2; \
-    row5 += zmm0 * zmm3; \
-    row4b += zmm0b * zmm2; \
-    row5b += zmm0b * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
-    row6 += zmm0 * zmm2; \
-    row7 += zmm0 * zmm3; \
-    row6b += zmm0b * zmm2; \
-    row7b += zmm0b * zmm3; \
-    BO += 8; \
-    AO += 16; \
-    AOb += 16;
-
-
-#define SAVE32x8(ALPHA) \
-    zmm0 = _mm512_set1_ps(ALPHA); \
-    row0 *= zmm0; \
-    row1 *= zmm0; \
-    row2 *= zmm0; \
-    row3 *= zmm0; \
-    row4 *= zmm0; \
-    row5 *= zmm0; \
-    row6 *= zmm0; \
-    row7 *= zmm0; \
-    row0b *= zmm0; \
-    row1b *= zmm0; \
-    row2b *= zmm0; \
-    row3b *= zmm0; \
-    row4b *= zmm0; \
-    row5b *= zmm0; \
-    row6b *= zmm0; \
-    row7b *= zmm0; \
-    row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
-    row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
-    row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
-    row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
-    row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
-    row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
-    row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
-    row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
-    _mm512_storeu_ps(CO1 + 0 * ldc, row0); \
-    _mm512_storeu_ps(CO1 + 1 * ldc, row1); \
-    _mm512_storeu_ps(CO1 + 2 * ldc, row2); \
-    _mm512_storeu_ps(CO1 + 3 * ldc, row3); \
-    _mm512_storeu_ps(CO1 + 4 * ldc, row4); \
-    _mm512_storeu_ps(CO1 + 5 * ldc, row5); \
-    _mm512_storeu_ps(CO1 + 6 * ldc, row6); \
-    _mm512_storeu_ps(CO1 + 7 * ldc, row7); \
-    row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \
-    row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \
-    row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \
-    row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \
-    row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \
-    row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \
-    row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \
-    row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \
-    _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \
-    _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \
-    _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \
-    _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \
-    _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \
-    _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \
-    _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \
-    _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \
-
-
-#define INIT16x8() \
-    row0 = _mm512_setzero_ps(); \
-    row1 = _mm512_setzero_ps(); \
-    row2 = _mm512_setzero_ps(); \
-    row3 = _mm512_setzero_ps(); \
-    row4 = _mm512_setzero_ps(); \
-    row5 = _mm512_setzero_ps(); \
-    row6 = _mm512_setzero_ps(); \
-    row7 = _mm512_setzero_ps(); \
-
-#define KERNEL16x8_SUB() \
-    zmm0 = _mm512_loadu_ps(AO); \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
-    row0 += zmm0 * zmm2; \
-    row1 += zmm0 * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
-    row2 += zmm0 * zmm2; \
-    row3 += zmm0 * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
-    row4 += zmm0 * zmm2; \
-    row5 += zmm0 * zmm3; \
-    zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
-    zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
-    row6 += zmm0 * zmm2; \
-    row7 += zmm0 * zmm3; \
-    BO += 8; \
-    AO += 16;
-
-
-#define SAVE16x8(ALPHA) \
-    zmm0 = _mm512_set1_ps(ALPHA); \
-    row0 *= zmm0; \
-    row1 *= zmm0; \
-    row2 *= zmm0; \
-    row3 *= zmm0; \
-    row4 *= zmm0; \
-    row5 *= zmm0; \
-    row6 *= zmm0; \
-    row7 *= zmm0; \
-    row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
-    row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
-    row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
-    row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
-    row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
-    row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
-    row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
-    row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
-    _mm512_storeu_ps(CO1 + 0 * ldc, row0); \
-    _mm512_storeu_ps(CO1 + 1 * ldc, row1); \
-    _mm512_storeu_ps(CO1 + 2 * ldc, row2); \
-    _mm512_storeu_ps(CO1 + 3 * ldc, row3); \
-    _mm512_storeu_ps(CO1 + 4 * ldc, row4); \
-    _mm512_storeu_ps(CO1 + 5 * ldc, row5); \
-    _mm512_storeu_ps(CO1 + 6 * ldc, row6); \
-    _mm512_storeu_ps(CO1 + 7 * ldc, row7);
-
-
-
-/*******************************************************************************************/
-
-#define INIT8x8() \
-    row0 = _mm256_setzero_ps(); \
-    row1 = _mm256_setzero_ps(); \
-    row2 = _mm256_setzero_ps(); \
-    row3 = _mm256_setzero_ps(); \
-    row4 = _mm256_setzero_ps(); \
-    row5 = _mm256_setzero_ps(); \
-    row6 = _mm256_setzero_ps(); \
-    row7 = _mm256_setzero_ps(); \
-
-#define KERNEL8x8_SUB() \
-    ymm0 = _mm256_loadu_ps(AO); \
-    ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
-    ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
-    row0 += ymm0 * ymm2; \
-    row1 += ymm0 * ymm3; \
-    ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
-    ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
-    row2 += ymm0 * ymm2; \
-    row3 += ymm0 * ymm3; \
-    ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \
-    ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \
-    row4 += ymm0 * ymm2; \
-    row5 += ymm0 * ymm3; \
-    ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \
-    ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \
-    row6 += ymm0 * ymm2; \
-    row7 += ymm0 * ymm3; \
-    BO += 8; \
-    AO += 8;
-
-
-#define SAVE8x8(ALPHA) \
-    ymm0 = _mm256_set1_ps(ALPHA); \
-    row0 *= ymm0; \
-    row1 *= ymm0; \
-    row2 *= ymm0; \
-    row3 *= ymm0; \
-    row4 *= ymm0; \
-    row5 *= ymm0; \
-    row6 *= ymm0; \
-    row7 *= ymm0; \
-    row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \
-    row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \
-    row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \
-    row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \
-    row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \
-    row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \
-    row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \
-    row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \
-    _mm256_storeu_ps(CO1 + 0 * ldc, row0); \
-    _mm256_storeu_ps(CO1 + 1 * ldc, row1); \
-    _mm256_storeu_ps(CO1 + 2 * ldc, row2); \
-    _mm256_storeu_ps(CO1 + 3 * ldc, row3); \
-    _mm256_storeu_ps(CO1 + 4 * ldc, row4); \
-    _mm256_storeu_ps(CO1 + 5 * ldc, row5); \
-    _mm256_storeu_ps(CO1 + 6 * ldc, row6); \
-    _mm256_storeu_ps(CO1 + 7 * ldc, row7); \
-
-
-
-/*******************************************************************************************/
-
-#define INIT4x8() \
-    row0 = _mm_setzero_ps(); \
-    row1 = _mm_setzero_ps(); \
-    row2 = _mm_setzero_ps(); \
-    row3 = _mm_setzero_ps(); \
-    row4 = _mm_setzero_ps(); \
-    row5 = _mm_setzero_ps(); \
-    row6 = _mm_setzero_ps(); \
-    row7 = _mm_setzero_ps(); \
-
-
-#define KERNEL4x8_SUB() \
-    xmm0 = _mm_loadu_ps(AO); \
-    xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
-    xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
-    row0 += xmm0 * xmm2; \
-    row1 += xmm0 * xmm3; \
-    xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
-    xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
-    row2 += xmm0 * xmm2; \
-    row3 += xmm0 * xmm3; \
-    xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \
-    xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \
-    row4 += xmm0 * xmm2; \
-    row5 += xmm0 * xmm3; \
-    xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \
-    xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \
-    row6 += xmm0 * xmm2; \
-    row7 += xmm0 * xmm3; \
-    BO += 8; \
-    AO += 4;
-
-
-#define SAVE4x8(ALPHA) \
-    xmm0 = _mm_set1_ps(ALPHA); \
-    row0 *= xmm0; \
-    row1 *= xmm0; \
-    row2 *= xmm0; \
-    row3 *= xmm0; \
-    row4 *= xmm0; \
-    row5 *= xmm0; \
-    row6 *= xmm0; \
-    row7 *= xmm0; \
-    row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
-    row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
-    row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
-    row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
-    row4 += _mm_loadu_ps(CO1 + 4 * ldc); \
-    row5 += _mm_loadu_ps(CO1 + 5 * ldc); \
-    row6 += _mm_loadu_ps(CO1 + 6 * ldc); \
-    row7 += _mm_loadu_ps(CO1 + 7 * ldc); \
-    _mm_storeu_ps(CO1 + 0 * ldc, row0); \
-    _mm_storeu_ps(CO1 + 1 * ldc, row1); \
-    _mm_storeu_ps(CO1 + 2 * ldc, row2); \
-    _mm_storeu_ps(CO1 + 3 * ldc, row3); \
-    _mm_storeu_ps(CO1 + 4 * ldc, row4); \
-    _mm_storeu_ps(CO1 + 5 * ldc, row5); \
-    _mm_storeu_ps(CO1 + 6 * ldc, row6); \
-    _mm_storeu_ps(CO1 + 7 * ldc, row7); \
-
-
-/*******************************************************************************************/
-
-#define INIT2x8() \
-    row0a = row0b = 0; \
-    row1a = row1b = 0; \
-    row2a = row2b = 0; \
-    row3a = row3b = 0; \
-    row4a = row4b = 0; \
-    row5a = row5b = 0; \
-    row6a = row6b = 0; \
-    row7a = row7b = 0; \
-
-#define KERNEL2x8_SUB() \
-    xmm0 = *(AO); \
-    xmm1 = *(AO + 1); \
-    xmm2 = *(BO + 0); \
-    xmm3 = *(BO + 1); \
-    row0a += xmm0 * xmm2; \
-    row0b += xmm1 * xmm2; \
-    row1a += xmm0 * xmm3; \
-    row1b += xmm1 * xmm3; \
-    xmm2 = *(BO + 2); \
-    xmm3 = *(BO + 3); \
-    row2a += xmm0 * xmm2; \
-    row2b += xmm1 * xmm2; \
-    row3a += xmm0 * xmm3; \
-    row3b += xmm1 * xmm3; \
-    xmm2 = *(BO + 4); \
-    xmm3 = *(BO + 5); \
-    row4a += xmm0 * xmm2; \
-    row4b += xmm1 * xmm2; \
-    row5a += xmm0 * xmm3; \
-    row5b += xmm1 * xmm3; \
-    xmm2 = *(BO + 6); \
-    xmm3 = *(BO + 7); \
-    row6a += xmm0 * xmm2; \
-    row6b += xmm1 * xmm2; \
-    row7a += xmm0 * xmm3; \
-    row7b += xmm1 * xmm3; \
-    BO += 8; \
-    AO += 2;
-
-
-#define SAVE2x8(ALPHA) \
-    xmm0 = ALPHA; \
-    row0a *= xmm0; \
-    row0b *= xmm0; \
-    row1a *= xmm0; \
-    row1b *= xmm0; \
-    row2a *= xmm0; \
-    row2b *= xmm0; \
-    row3a *= xmm0; \
-    row3b *= xmm0; \
-    row4a *= xmm0; \
-    row4b *= xmm0; \
-    row5a *= xmm0; \
-    row5b *= xmm0; \
-    row6a *= xmm0; \
-    row6b *= xmm0; \
-    row7a *= xmm0; \
-    row7b *= xmm0; \
-    *(CO1 + 0 * ldc + 0) += row0a; \
-    *(CO1 + 0 * ldc + 1) += row0b; \
-    *(CO1 + 1 * ldc + 0) += row1a; \
-    *(CO1 + 1 * ldc + 1) += row1b; \
-    *(CO1 + 2 * ldc + 0) += row2a; \
-    *(CO1 + 2 * ldc + 1) += row2b; \
-    *(CO1 + 3 * ldc + 0) += row3a; \
-    *(CO1 + 3 * ldc + 1) += row3b; \
-    *(CO1 + 4 * ldc + 0) += row4a; \
-    *(CO1 + 4 * ldc + 1) += row4b; \
-    *(CO1 + 5 * ldc + 0) += row5a; \
-    *(CO1 + 5 * ldc + 1) += row5b; \
-    *(CO1 + 6 * ldc + 0) += row6a; \
-    *(CO1 + 6 * ldc + 1) += row6b; \
-    *(CO1 + 7 * ldc + 0) += row7a; \
-    *(CO1 + 7 * ldc + 1) += row7b; \
-
-
-
-/*******************************************************************************************/
-
-#define INIT1x8() \
-    row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0;
-
-#define KERNEL1x8_SUB() \
-    xmm0 = *(AO ); \
-    xmm2 = *(BO + 0); \
-    xmm3 = *(BO + 1); \
-    row0 += xmm0 * xmm2; \
-    row1 += xmm0 * xmm3; \
-    xmm2 = *(BO + 2); \
-    xmm3 = *(BO + 3); \
-    row2 += xmm0 * xmm2; \
-    row3 += xmm0 * xmm3; \
-    xmm2 = *(BO + 4); \
-    xmm3 = *(BO + 5); \
-    row4 += xmm0 * xmm2; \
-    row5 += xmm0 * xmm3; \
-    xmm2 = *(BO + 6); \
-    xmm3 = *(BO + 7); \
-    row6 += xmm0 * xmm2; \
-    row7 += xmm0 * xmm3; \
-    BO += 8; \
-    AO += 1;
-
-
-#define SAVE1x8(ALPHA) \
-    xmm0 = ALPHA; \
-    row0 *= xmm0; \
-    row1 *= xmm0; \
-    row2 *= xmm0; \
-    row3 *= xmm0; \
-    row4 *= xmm0; \
-    row5 *= xmm0; \
-    row6 *= xmm0; \
-    row7 *= xmm0; \
-    *(CO1 + 0 * ldc) += row0; \
-    *(CO1 + 1 * ldc) += row1; \
-    *(CO1 + 2 * ldc) += row2; \
-    *(CO1 + 3 * ldc) += row3; \
-    *(CO1 + 4 * ldc) += row4; \
-    *(CO1 + 5 * ldc) += row5; \
-    *(CO1 + 6 * ldc) += row6; \
-    *(CO1 + 7 * ldc) += row7; \
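The hunk above removes the C macro triples (INIT/KERNEL/SAVE) for the
8-column micro-kernels, from 32x8 down to 1x8. They all follow one pattern:
per k step, broadcast one element of packed B, multiply it against a slice
of packed A, and accumulate into a per-output-row register; SAVE then scales
the accumulators by ALPHA and adds them into C. (Note that INIT32x8() never
zeroes row7 even though KERNEL32x8_SUB() accumulates into it, which fits the
32x8 path being compiled out in the hunk below.) A scalar model of the
smallest variant, mirroring INIT1x8/KERNEL1x8_SUB/SAVE1x8 but written for
illustration rather than taken from the retained source:

    /* Scalar model of the deleted 1x8 micro-kernel: one packed-A element
     * against 8 packed-B values per k step, then C += alpha * acc.       */
    static void kernel_1x8(long K, float alpha, const float *AO,
                           const float *BO, float *CO1, long ldc)
    {
        float row[8] = {0};                     /* INIT1x8                */

        for (long k = 0; k < K; k++) {          /* KERNEL1x8_SUB, K times */
            float a = *AO;
            for (int j = 0; j < 8; j++)
                row[j] += a * BO[j];
            BO += 8;                            /* 8 B values per k step  */
            AO += 1;                            /* 1 A value per k step   */
        }
        for (int j = 0; j < 8; j++)             /* SAVE1x8                */
            CO1[j * ldc] += alpha * row[j];
    }

The wider variants are the same loop with the inner scalar replaced by a
4-, 8-, 16-, or 32-float vector of A per k step.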
@@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
 	return 0;
-
-
-	// L8_0
-	while (N >= 8 && 0) {
-		float *CO1;
-		float *AO;
-		int i;
-		// L8_10
-		CO1 = C;
-		C += 8 * ldc;
-
-		AO = A;
-
-		i = m;
-
-		while (i >= 32 && 0) {
-			float *BO, *AOb;
-			// L8_11
-			__m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
-			BO = B;
-			int kloop = K;
-			AOb = AO + 16 * K;
-
-			INIT32x8()
-
-			while (kloop > 0) {
-				// L12_17
-				KERNEL32x8_SUB()
-				kloop--;
-			}
-			// L8_19
-			SAVE32x8(alpha)
-			CO1 += 32;
-			AO += 16 * K;
-
-			i -= 32;
-		}
-		while (i >= 16) {
-			float *BO;
-			// L8_11
-			__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7;
-			BO = B;
-			int kloop = K;
-
-			INIT16x8()
-
-			while (kloop > 0) {
-				KERNEL16x8_SUB()
-				kloop--;
-			}
-			SAVE16x8(alpha)
-			CO1 += 16;
-
-			i -= 16;
-		}
-		while (i >= 8) {
-			float *BO;
-			// L8_11
-			__m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7;
-			BO = B;
-			int kloop = K;
-
-			INIT8x8()
-
-			while (kloop > 0) {
-				// L12_17
-				KERNEL8x8_SUB()
-				kloop--;
-			}
-			// L8_19
-			SAVE8x8(alpha)
-			CO1 += 8;
-
-			i -= 8;
-		}
-		while (i >= 4) {
-			// L8_11
-			float *BO;
-			__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
-			BO = B;
-			int kloop = K;
-
-			INIT4x8()
-			// L8_16
-			while (kloop > 0) {
-				// L12_17
-				KERNEL4x8_SUB()
-				kloop--;
-			}
-			// L8_19
-			SAVE4x8(alpha)
-			CO1 += 4;
-
-			i -= 4;
-		}
-
-/**************************************************************************
-* Rest of M
-***************************************************************************/
-
-		while (i >= 2) {
-			float *BO;
-			float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
-			BO = B;
-
-			INIT2x8()
-			int kloop = K;
-
-			while (kloop > 0) {
-				KERNEL2x8_SUB()
-				kloop--;
-			}
-			SAVE2x8(alpha)
-			CO1 += 2;
-			i -= 2;
-		}
-		// L13_40
-		while (i >= 1) {
-			float *BO;
-			float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
-			int kloop = K;
-			BO = B;
-			INIT1x8()
-
-			while (kloop > 0) {
-				KERNEL1x8_SUB()
-				kloop--;
-			}
-			SAVE1x8(alpha)
-			CO1 += 1;
-			i -= 1;
-		}
-
-		B += K * 8;
-		N -= 8;
-	}
 	while (N >= 4) {
 		float *CO1;
 		float *AO;
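Worth noting: the driver loop deleted in the second hunk was already dead.
Both `while (N >= 8 && 0)` and `while (i >= 32 && 0)` carry an `&& 0` guard,
so the N=8 path could never execute and removing it (together with its
macros above) changes no behavior. The `while (N >= 4)` driver that survives
keeps the same shape: walk M downward through progressively narrower tiles,
running an INIT/KERNEL/SAVE sequence per tile, then advance B and C past the
finished column block. A simplified, self-contained model of that tiling
(block widths, kernel_tile, and the packing layout are illustrative
assumptions, not the retained OpenBLAS code):

    /* Scalar stand-in for one INIT/KERNEL/SAVE macro triple: a W-row by
     * 4-column tile, with A packed W elements per k step and B packed
     * 4 elements per k step, mirroring the AO/BO strides in the diff.   */
    static void kernel_tile(long W, long K, float alpha,
                            const float *AO, const float *BO,
                            float *CO1, long ldc)
    {
        for (long r = 0; r < W; r++)
            for (long j = 0; j < 4; j++) {
                float acc = 0.0f;                  /* INIT              */
                for (long k = 0; k < K; k++)       /* KERNEL, K times   */
                    acc += AO[k * W + r] * BO[k * 4 + j];
                CO1[j * ldc + r] += alpha * acc;   /* SAVE              */
            }
    }

    /* Driver skeleton in the shape of the code above: tile N in blocks
     * of 4 and M downward through narrower and narrower blocks.        */
    void drive_n4(long M, long N, long K, float alpha,
                  const float *A, const float *B, float *C, long ldc)
    {
        while (N >= 4) {
            float *CO1 = C;
            const float *AO = A;
            long i = M;

            while (i >= 16) { kernel_tile(16, K, alpha, AO, B, CO1, ldc);
                              AO += 16 * K; CO1 += 16; i -= 16; }
            while (i >= 4)  { kernel_tile( 4, K, alpha, AO, B, CO1, ldc);
                              AO +=  4 * K; CO1 +=  4; i -=  4; }
            while (i >= 1)  { kernel_tile( 1, K, alpha, AO, B, CO1, ldc);
                              AO +=  1 * K; CO1 +=  1; i -=  1; }

            B += K * 4;        /* past this packed B column block       */
            C += 4 * ldc;      /* next block of 4 output columns        */
            N -= 4;
        }
    }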