From d4bad73834a9e1abf23e3c0a8f4e9a84e9137881 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 10 Oct 2018 01:49:22 +0000 Subject: [PATCH] Add a C+intrinsics version of the SGEMM/skylakex kernel for most sizes this is 1.2x to 1.4x faster than the current code --- kernel/x86_64/sgemm_beta_skylakex.c | 150 ++ kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 1726 ++++++++++++++++++++ kernel/x86_64/sgemm_ncopy_4_skylakex.c | 207 +++ kernel/x86_64/sgemm_tcopy_16_skylakex.c | 387 +++++ 4 files changed, 2470 insertions(+) create mode 100644 kernel/x86_64/sgemm_beta_skylakex.c create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.c create mode 100644 kernel/x86_64/sgemm_ncopy_4_skylakex.c create mode 100644 kernel/x86_64/sgemm_tcopy_16_skylakex.c diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c new file mode 100644 index 000000000..b1bf4d77a --- /dev/null +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#include + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + /* fast path.. just zero the whole matrix */ + if (m == ldc && (unsigned long)beta == (unsigned long)ZERO) { + memset(c, 0, m * n * sizeof(FLOAT)); + return 0; + } + + + c_offset = c; + + if (beta == ZERO){ + __m512 z_zero; + + z_zero = _mm512_setzero_ps(); + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = m; + + while (i > 32) { + _mm512_storeu_ps(c_offset1, z_zero); + _mm512_storeu_ps(c_offset1 + 8, z_zero); + _mm512_storeu_ps(c_offset1 + 16, z_zero); + _mm512_storeu_ps(c_offset1 + 24 , z_zero); + c_offset1 += 32; + i -= 32; + } + while (i > 8) { + _mm512_storeu_ps(c_offset1, z_zero); + c_offset1 += 8; + i -= 8; + } + + while (i > 0) { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c new file mode 100644 index 000000000..b2b1ab03f --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -0,0 +1,1726 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/* comment below left for history, data does not represent the implementation in this file */ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#include "common.h" +#include + + + +/******************************************************************************************* +* 8 lines of N +*******************************************************************************************/ + + + +#define INIT32x8() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row4 = _mm512_setzero_ps(); \ + row5 = _mm512_setzero_ps(); \ + row6 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row4b = _mm512_setzero_ps(); \ + row5b = _mm512_setzero_ps(); \ + row6b = _mm512_setzero_ps(); \ + row7b = _mm512_setzero_ps(); \ + +#define KERNEL32x8_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm0b = _mm512_loadu_ps(AOb); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm0b * zmm2; \ + row1b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm0b * zmm2; \ + row3b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += zmm0 * zmm2; \ + row5 += zmm0 * zmm3; \ + row4b += zmm0b * zmm2; \ + row5b += zmm0b * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + row6b += zmm0b * zmm2; \ + row7b += zmm0b * zmm3; \ + BO += 8; \ + AO += 16; \ + AOb += 16; + + +#define SAVE32x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row4b *= zmm0; \ + row5b *= zmm0; \ + row6b *= zmm0; \ + row7b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); \ + row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \ + row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \ + row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \ + row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \ + row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \ + _mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \ + _mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \ + _mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \ + _mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \ + _mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \ + + +#define INIT16x8() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row4 = _mm512_setzero_ps(); \ + row5 = _mm512_setzero_ps(); \ + row6 = _mm512_setzero_ps(); \ + row7 = _mm512_setzero_ps(); \ + +#define KERNEL16x8_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += zmm0 * zmm2; \ + row5 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += zmm0 * zmm2; \ + row7 += zmm0 * zmm3; \ + BO += 8; \ + AO += 16; + + +#define SAVE16x8(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row4 *= zmm0; \ + row5 *= zmm0; \ + row6 *= zmm0; \ + row7 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm512_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm512_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm512_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm512_storeu_ps(CO1 + 7 * ldc, row7); + + + +/*******************************************************************************************/ + +#define INIT8x8() \ + row0 = _mm256_setzero_ps(); \ + row1 = _mm256_setzero_ps(); \ + row2 = _mm256_setzero_ps(); \ + row3 = _mm256_setzero_ps(); \ + row4 = _mm256_setzero_ps(); \ + row5 = _mm256_setzero_ps(); \ + row6 = _mm256_setzero_ps(); \ + row7 = _mm256_setzero_ps(); \ + +#define KERNEL8x8_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += ymm0 * ymm2; \ + row1 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += ymm0 * ymm2; \ + row3 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += ymm0 * ymm2; \ + row5 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += ymm0 * ymm2; \ + row7 += ymm0 * ymm3; \ + BO += 8; \ + AO += 8; + + +#define SAVE8x8(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + row0 *= ymm0; \ + row1 *= ymm0; \ + row2 *= ymm0; \ + row3 *= ymm0; \ + row4 *= ymm0; \ + row5 *= ymm0; \ + row6 *= ymm0; \ + row7 *= ymm0; \ + row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm256_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm256_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm256_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm256_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm256_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm256_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm256_storeu_ps(CO1 + 7 * ldc, row7); \ + + + +/*******************************************************************************************/ + +#define INIT4x8() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + row4 = _mm_setzero_ps(); \ + row5 = _mm_setzero_ps(); \ + row6 = _mm_setzero_ps(); \ + row7 = _mm_setzero_ps(); \ + + +#define KERNEL4x8_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \ + row6 += xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 4; + + +#define SAVE4x8(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + row4 += _mm_loadu_ps(CO1 + 4 * ldc); \ + row5 += _mm_loadu_ps(CO1 + 5 * ldc); \ + row6 += _mm_loadu_ps(CO1 + 6 * ldc); \ + row7 += _mm_loadu_ps(CO1 + 7 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + _mm_storeu_ps(CO1 + 4 * ldc, row4); \ + _mm_storeu_ps(CO1 + 5 * ldc, row5); \ + _mm_storeu_ps(CO1 + 6 * ldc, row6); \ + _mm_storeu_ps(CO1 + 7 * ldc, row7); \ + + +/*******************************************************************************************/ + +#define INIT2x8() \ + row0a = row0b = 0; \ + row1a = row1b = 0; \ + row2a = row2b = 0; \ + row3a = row3b = 0; \ + row4a = row4b = 0; \ + row5a = row5b = 0; \ + row6a = row6b = 0; \ + row7a = row7b = 0; \ + +#define KERNEL2x8_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0a += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1a += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2a += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3a += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4a += xmm0 * xmm2; \ + row4b += xmm1 * xmm2; \ + row5a += xmm0 * xmm3; \ + row5b += xmm1 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6a += xmm0 * xmm2; \ + row6b += xmm1 * xmm2; \ + row7a += xmm0 * xmm3; \ + row7b += xmm1 * xmm3; \ + BO += 8; \ + AO += 2; + + +#define SAVE2x8(ALPHA) \ + xmm0 = ALPHA; \ + row0a *= xmm0; \ + row0b *= xmm0; \ + row1a *= xmm0; \ + row1b *= xmm0; \ + row2a *= xmm0; \ + row2b *= xmm0; \ + row3a *= xmm0; \ + row3b *= xmm0; \ + row4a *= xmm0; \ + row4b *= xmm0; \ + row5a *= xmm0; \ + row5b *= xmm0; \ + row6a *= xmm0; \ + row6b *= xmm0; \ + row7a *= xmm0; \ + row7b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0a; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1a; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2a; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3a; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + *(CO1 + 4 * ldc + 0) += row4a; \ + *(CO1 + 4 * ldc + 1) += row4b; \ + *(CO1 + 5 * ldc + 0) += row5a; \ + *(CO1 + 5 * ldc + 1) += row5b; \ + *(CO1 + 6 * ldc + 0) += row6a; \ + *(CO1 + 6 * ldc + 1) += row6b; \ + *(CO1 + 7 * ldc + 0) += row7a; \ + *(CO1 + 7 * ldc + 1) += row7b; \ + + + +/*******************************************************************************************/ + +#define INIT1x8() \ + row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; + +#define KERNEL1x8_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + xmm2 = *(BO + 4); \ + xmm3 = *(BO + 5); \ + row4 += xmm0 * xmm2; \ + row5 += xmm0 * xmm3; \ + xmm2 = *(BO + 6); \ + xmm3 = *(BO + 7); \ + row6 += xmm0 * xmm2; \ + row7 += xmm0 * xmm3; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row4 *= xmm0; \ + row5 *= xmm0; \ + row6 *= xmm0; \ + row7 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + *(CO1 + 4 * ldc) += row4; \ + *(CO1 + 5 * ldc) += row5; \ + *(CO1 + 6 * ldc) += row6; \ + *(CO1 + 7 * ldc) += row7; \ + + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +#define INIT64x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + row0d = _mm512_setzero_ps(); \ + row1d = _mm512_setzero_ps(); \ + row2d = _mm512_setzero_ps(); \ + row3d = _mm512_setzero_ps(); \ + +#define KERNEL64x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm7 = _mm512_loadu_ps(A3); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + row0d += zmm7 * zmm2; \ + row1d += zmm7 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + row2d += zmm7 * zmm2; \ + row3d += zmm7 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; \ + A3 += 16; \ + + +#define SAVE64x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0d *= zmm0; \ + row1d *= zmm0; \ + row2d *= zmm0; \ + row3d *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); \ + row0d += _mm512_loadu_ps(CO1 + 0*ldc + 48); \ + row1d += _mm512_loadu_ps(CO1 + 1*ldc + 48); \ + row2d += _mm512_loadu_ps(CO1 + 2*ldc + 48); \ + row3d += _mm512_loadu_ps(CO1 + 3*ldc + 48); \ + _mm512_storeu_ps(CO1 + 0*ldc + 48, row0d); \ + _mm512_storeu_ps(CO1 + 1*ldc + 48, row1d); \ + _mm512_storeu_ps(CO1 + 2*ldc + 48, row2d); \ + _mm512_storeu_ps(CO1 + 3*ldc + 48, row3d); + + +#define INIT48x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + row0c = _mm512_setzero_ps(); \ + row1c = _mm512_setzero_ps(); \ + row2c = _mm512_setzero_ps(); \ + row3c = _mm512_setzero_ps(); \ + +#define KERNEL48x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm5 = _mm512_loadu_ps(A2); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + row0c += zmm5 * zmm2; \ + row1c += zmm5 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + row2c += zmm5 * zmm2; \ + row3c += zmm5 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; \ + A2 += 16; + + +#define SAVE48x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0c *= zmm0; \ + row1c *= zmm0; \ + row2c *= zmm0; \ + row3c *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \ + row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \ + row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \ + row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \ + row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \ + _mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \ + _mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \ + _mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \ + _mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); + + +#define INIT32x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + row0b = _mm512_setzero_ps(); \ + row1b = _mm512_setzero_ps(); \ + row2b = _mm512_setzero_ps(); \ + row3b = _mm512_setzero_ps(); \ + +#define KERNEL32x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm1 = _mm512_loadu_ps(A1); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + row0b += zmm1 * zmm2; \ + row1b += zmm1 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + row2b += zmm1 * zmm2; \ + row3b += zmm1 * zmm3; \ + BO += 4; \ + AO += 16; \ + A1 += 16; + + +#define SAVE32x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0b *= zmm0; \ + row1b *= zmm0; \ + row2b *= zmm0; \ + row3b *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0*ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1*ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2*ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3*ldc); \ + _mm512_storeu_ps(CO1 + 0*ldc, row0); \ + _mm512_storeu_ps(CO1 + 1*ldc, row1); \ + _mm512_storeu_ps(CO1 + 2*ldc, row2); \ + _mm512_storeu_ps(CO1 + 3*ldc, row3); \ + row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \ + row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \ + row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \ + row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \ + _mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \ + _mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \ + _mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \ + _mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); + + + +#define INIT16x4() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + row2 = _mm512_setzero_ps(); \ + row3 = _mm512_setzero_ps(); \ + +#define KERNEL16x4_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \ + row2 += zmm0 * zmm2; \ + row3 += zmm0 * zmm3; \ + BO += 4; \ + AO += 16; + + +#define SAVE16x4(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row2 *= zmm0; \ + row3 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \ + _mm512_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm512_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm512_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm512_storeu_ps(CO1 + 3 * ldc, row3); + + + +/*******************************************************************************************/ + +#define INIT8x4() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + ymm8 = _mm256_setzero_ps(); \ + ymm10 = _mm256_setzero_ps(); \ + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \ + ymm8 += ymm0 * ymm2; \ + ymm10 += ymm0 * ymm3; \ + BO += 4; \ + AO += 8; + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm8 *= ymm0; \ + ymm10 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1 + 0 * ldc); \ + ymm6 += _mm256_loadu_ps(CO1 + 1 * ldc); \ + ymm8 += _mm256_loadu_ps(CO1 + 2 * ldc); \ + ymm10 += _mm256_loadu_ps(CO1 + 3 * ldc); \ + _mm256_storeu_ps(CO1 + 0 * ldc, ymm4); \ + _mm256_storeu_ps(CO1 + 1 * ldc, ymm6); \ + _mm256_storeu_ps(CO1 + 2 * ldc, ymm8); \ + _mm256_storeu_ps(CO1 + 3 * ldc, ymm10); \ + + + +/*******************************************************************************************/ + +#define INIT4x4() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + row2 = _mm_setzero_ps(); \ + row3 = _mm_setzero_ps(); \ + + +#define KERNEL4x4_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 4; + + +#define SAVE4x4(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + row0 += _mm_loadu_ps(CO1 + 0 * ldc); \ + row1 += _mm_loadu_ps(CO1 + 1 * ldc); \ + row2 += _mm_loadu_ps(CO1 + 2 * ldc); \ + row3 += _mm_loadu_ps(CO1 + 3 * ldc); \ + _mm_storeu_ps(CO1 + 0 * ldc, row0); \ + _mm_storeu_ps(CO1 + 1 * ldc, row1); \ + _mm_storeu_ps(CO1 + 2 * ldc, row2); \ + _mm_storeu_ps(CO1 + 3 * ldc, row3); \ + + +/*******************************************************************************************/ + +#define INIT2x4() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + row2 = 0; row2b = 0; row3 = 0; row3b = 0; + +#define KERNEL2x4_SUB() \ + xmm0 = *(AO); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row2b += xmm1 * xmm2; \ + row3 += xmm0 * xmm3; \ + row3b += xmm1 * xmm3; \ + BO += 4; \ + AO += 2; + + +#define SAVE2x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + row2 *= xmm0; \ + row2b *= xmm0; \ + row3 *= xmm0; \ + row3b *= xmm0; \ + *(CO1 + 0 * ldc + 0) += row0; \ + *(CO1 + 0 * ldc + 1) += row0b; \ + *(CO1 + 1 * ldc + 0) += row1; \ + *(CO1 + 1 * ldc + 1) += row1b; \ + *(CO1 + 2 * ldc + 0) += row2; \ + *(CO1 + 2 * ldc + 1) += row2b; \ + *(CO1 + 3 * ldc + 0) += row3; \ + *(CO1 + 3 * ldc + 1) += row3b; \ + + + +/*******************************************************************************************/ + +#define INIT1x4() \ + row0 = 0; row1 = 0; row2 = 0; row3 = 0; +#define KERNEL1x4_SUB() \ + xmm0 = *(AO ); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + xmm2 = *(BO + 2); \ + xmm3 = *(BO + 3); \ + row2 += xmm0 * xmm2; \ + row3 += xmm0 * xmm3; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row2 *= xmm0; \ + row3 *= xmm0; \ + *(CO1 + 0 * ldc) += row0; \ + *(CO1 + 1 * ldc) += row1; \ + *(CO1 + 2 * ldc) += row2; \ + *(CO1 + 3 * ldc) += row3; \ + + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define INIT16x2() \ + row0 = _mm512_setzero_ps(); \ + row1 = _mm512_setzero_ps(); \ + + +#define KERNEL16x2_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += zmm0 * zmm2; \ + row1 += zmm0 * zmm3; \ + BO += 2; \ + AO += 16; + + +#define SAVE16x2(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row1 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + row1 += _mm512_loadu_ps(CO1 + ldc); \ + _mm512_storeu_ps(CO1 , row0); \ + _mm512_storeu_ps(CO1 + ldc, row1); \ + + + + +/*******************************************************************************************/ + +#define INIT8x2() \ + ymm4 = _mm256_setzero_ps(); \ + ymm6 = _mm256_setzero_ps(); \ + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \ + ymm4 += ymm0 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + BO += 2; \ + AO += 8; + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm6 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + ymm6 += _mm256_loadu_ps(CO1 + ldc); \ + _mm256_storeu_ps(CO1 , ymm4); \ + _mm256_storeu_ps(CO1 + ldc, ymm6); \ + + + +/*******************************************************************************************/ + +#define INIT4x2() \ + row0 = _mm_setzero_ps(); \ + row1 = _mm_setzero_ps(); \ + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 4; + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row1 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + row1 += _mm_loadu_ps(CO1 + ldc); \ + _mm_storeu_ps(CO1 , row0); \ + _mm_storeu_ps(CO1 + ldc, row1); \ + + + +/*******************************************************************************************/ + + +#define INIT2x2() \ + row0 = 0; row0b = 0; row1 = 0; row1b = 0; \ + +#define KERNEL2x2_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + row1 += xmm0 * xmm3; \ + row1b += xmm1 * xmm3; \ + BO += 2; \ + AO += 2; \ + + +#define SAVE2x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + row1 *= xmm0; \ + row1b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + *(CO1 + ldc ) += row1; \ + *(CO1 + ldc +1) += row1b; \ + + +/*******************************************************************************************/ + +#define INIT1x2() \ + row0 = 0; row1 = 0; + +#define KERNEL1x2_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO + 0); \ + xmm3 = *(BO + 1); \ + row0 += xmm0 * xmm2; \ + row1 += xmm0 * xmm3; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row1 *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 + ldc ) += row1; \ + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define INIT16x1() \ + row0 = _mm512_setzero_ps(); \ + +#define KERNEL16x1_SUB() \ + zmm0 = _mm512_loadu_ps(AO); \ + zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += zmm0 * zmm2; \ + BO += 1; \ + AO += 16; + + +#define SAVE16x1(ALPHA) \ + zmm0 = _mm512_set1_ps(ALPHA); \ + row0 *= zmm0; \ + row0 += _mm512_loadu_ps(CO1); \ + _mm512_storeu_ps(CO1 , row0); \ + + +/*******************************************************************************************/ + +#define INIT8x1() \ + ymm4 = _mm256_setzero_ps(); + +#define KERNEL8x1_SUB() \ + ymm0 = _mm256_loadu_ps(AO); \ + ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + ymm0 = _mm256_set1_ps(ALPHA); \ + ymm4 *= ymm0; \ + ymm4 += _mm256_loadu_ps(CO1); \ + _mm256_storeu_ps(CO1 , ymm4); \ + + +/*******************************************************************************************/ + +#define INIT4x1() \ + row0 = _mm_setzero_ps(); \ + +#define KERNEL4x1_SUB() \ + xmm0 = _mm_loadu_ps(AO); \ + xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + xmm0 = _mm_set1_ps(ALPHA); \ + row0 *= xmm0; \ + row0 += _mm_loadu_ps(CO1); \ + _mm_storeu_ps(CO1 , row0); \ + + + +/*******************************************************************************************/ + +#define INIT2x1() \ + row0 = 0; row0b = 0; + +#define KERNEL2x1_SUB() \ + xmm0 = *(AO + 0); \ + xmm1 = *(AO + 1); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + row0b += xmm1 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + row0b *= xmm0; \ + *(CO1 ) += row0; \ + *(CO1 +1 ) += row0b; \ + + +/*******************************************************************************************/ + +#define INIT1x1() \ + row0 = 0; + +#define KERNEL1x1_SUB() \ + xmm0 = *(AO); \ + xmm2 = *(BO); \ + row0 += xmm0 * xmm2; \ + BO += 1; \ + AO += 1; + + +#define SAVE1x1(ALPHA) \ + xmm0 = ALPHA; \ + row0 *= xmm0; \ + *(CO1 ) += row0; \ + + +/*******************************************************************************************/ + + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG ldc) +{ + unsigned long M = m, N = n, K = k; + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + + + // L8_0 + while (N >= 8 && 0) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 8 * ldc; + + AO = A; + + i = m; + + while (i >= 32 && 0) { + float *BO, *AOb; + // L8_11 + __m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + int kloop = K; + AOb = AO + 16 * K; + + INIT32x8() + + while (kloop > 0) { + // L12_17 + KERNEL32x8_SUB() + kloop--; + } + // L8_19 + SAVE32x8(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT16x8() + + while (kloop > 0) { + KERNEL16x8_SUB() + kloop--; + } + SAVE16x8(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT8x8() + + while (kloop > 0) { + // L12_17 + KERNEL8x8_SUB() + kloop--; + } + // L8_19 + SAVE8x8(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + BO = B; + int kloop = K; + + INIT4x8() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x8_SUB() + kloop--; + } + // L8_19 + SAVE4x8(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; + BO = B; + + INIT2x8() + int kloop = K; + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; + int kloop = K; + BO = B; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 8; + N -= 8; + } + + while (N >= 4) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A; + + i = m; + while (i >= 64) { + float *BO; + float *A1, *A2, *A3; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, zmm5, row1, zmm7, row2, row3, row0b, row1b, row2b, row3b, row0c, row1c, row2c, row3c, row0d, row1d, row2d, row3d; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + A2 = A1 + 16 * K; + A3 = A2 + 16 * K; + + INIT64x4() + + while (kloop > 0) { + // L12_17 + KERNEL64x4_SUB() + kloop--; + } + // L8_19 + SAVE64x4(alpha) + CO1 += 64; + AO += 48 * K; + + i -= 64; + } + while (i >= 32) { + float *BO; + float *A1; + // L8_11 + __m512 zmm0, zmm1, zmm2, zmm3, row0, row1, row2, row3, row0b, row1b, row2b, row3b; + BO = B; + int kloop = K; + + A1 = AO + 16 * K; + + INIT32x4() + + while (kloop > 0) { + // L12_17 + KERNEL32x4_SUB() + kloop--; + } + // L8_19 + SAVE32x4(alpha) + CO1 += 32; + AO += 16 * K; + + i -= 32; + } + while (i >= 16) { + float *BO; + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT16x4() + + while (kloop > 0) { + // L12_17 + KERNEL16x4_SUB() + kloop--; + } + // L8_19 + SAVE16x4(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + // L8_11 + __m256 ymm0, ymm2, ymm3, ymm4, ymm6,ymm8,ymm10; + BO = B; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + CO1 += 8; + + i -= 8; + } + while (i >= 4) { + // L8_11 + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1, row2, row3; + BO = B; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + CO1 += 4; + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b, row2, row2b, row3, row3b; + BO = B; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1, row2, row3; + int kloop = K; + BO = B; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + float *CO1; + float *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + + // L8_11 + __m512 zmm0, zmm2, zmm3, row0, row1; + BO = B; + int kloop = K; + + INIT16x2() + + while (kloop > 0) { + // L12_17 + KERNEL16x2_SUB() + kloop--; + } + // L8_19 + SAVE16x2(alpha) + CO1 += 16; + + i -= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm3, ymm4, ymm6; + // L8_11 + BO = B; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + CO1 += 8; + + i-=8; + } + + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, xmm3, row0, row1; + // L8_11 + BO = B; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + kloop--; + } + // L8_19 + SAVE4x2(alpha) + CO1 += 4; + + i-=4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b; + int kloop = K; + BO = B; + + INIT2x2() + + while (kloop > 0) { + KERNEL2x2_SUB() + kloop--; + } + SAVE2x2(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, xmm3, row0, row1; + int kloop = K; + BO = B; + + INIT1x2() + + while (kloop > 0) { + KERNEL1x2_SUB() + kloop--; + } + SAVE1x2(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 2; + N -= 2; + } + + // L8_0 + while (N >= 1) { + // L8_10 + float *CO1; + float *AO; + int i; + + CO1 = C; + C += ldc; + + AO = A; + + i = m; + while (i >= 16) { + float *BO; + __m512 zmm0, zmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT16x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL16x1_SUB() + kloop--; + } + // L8_19 + SAVE16x1(alpha) + CO1 += 16; + + i-= 16; + } + while (i >= 8) { + float *BO; + __m256 ymm0, ymm2, ymm4; + // L8_11 + BO = B; + int kloop = K; + + INIT8x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x1_SUB() + kloop--; + } + // L8_19 + SAVE8x1(alpha) + CO1 += 8; + + i-= 8; + } + while (i >= 4) { + float *BO; + __m128 xmm0, xmm2, row0; + // L8_11 + BO = B; + int kloop = K; + + INIT4x1() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x1_SUB() + kloop--; + } + // L8_19 + SAVE4x1(alpha) + CO1 += 4; + + i-= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + float *BO; + float xmm0, xmm1, xmm2, row0, row0b; + int kloop = K; + BO = B; + + INIT2x1() + + while (kloop > 0) { + KERNEL2x1_SUB() + kloop--; + } + SAVE2x1(alpha) + CO1 += 2; + i -= 2; + } + // L13_40 + while (i >= 1) { + float *BO; + float xmm0, xmm2, row0; + int kloop = K; + + BO = B; + INIT1x1() + + + while (kloop > 0) { + KERNEL1x1_SUB() + kloop--; + } + SAVE1x1(alpha) + CO1 += 1; + i -= 1; + } + + B += K * 1; + N -= 1; + } + + + return 0; +} diff --git a/kernel/x86_64/sgemm_ncopy_4_skylakex.c b/kernel/x86_64/sgemm_ncopy_4_skylakex.c new file mode 100644 index 000000000..8577e3b38 --- /dev/null +++ b/kernel/x86_64/sgemm_ncopy_4_skylakex.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#include + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + __m128 row0, row1, row2, row3; + + row0 = _mm_loadu_ps(a_offset1); + row1 = _mm_loadu_ps(a_offset2); + row2 = _mm_loadu_ps(a_offset3); + row3 = _mm_loadu_ps(a_offset4); + + _MM_TRANSPOSE4_PS(row0, row1, row2, row3); + + _mm_storeu_ps(b_offset + 0, row0); + _mm_storeu_ps(b_offset + 4, row1); + _mm_storeu_ps(b_offset + 8, row2); + _mm_storeu_ps(b_offset + 12, row3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/x86_64/sgemm_tcopy_16_skylakex.c b/kernel/x86_64/sgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..dbacc5081 --- /dev/null +++ b/kernel/x86_64/sgemm_tcopy_16_skylakex.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __restrict b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +}