OpenBLAS has a fancy algorithm for copying the input data while laying it out in a more CPU friendly memory layout. This is great for large matrixes; the cost of the copy is easily ammortized by the gains from the better memory layout. But for small matrixes (on CPUs that can do efficient unaligned loads) this copy can be a net loss. This patch adds (for SKYLAKEX initially) a "sgemm direct" mode, that bypasses the whole copy machinary for ALPHA=1/BETA=0/... standard arguments, for small matrixes only. What is small? For the non-threaded case this has been measured to be in the M*N*K = 28 * 512 * 512 range, while in the threaded case it's less, around M*N*K = 1 * 512 * 512
1642 lines
41 KiB
C
1642 lines
41 KiB
C
/*********************************************************************************
|
|
Copyright (c) 2013, The OpenBLAS Project
|
|
All rights reserved.
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
3. Neither the name of the OpenBLAS project nor the names of
|
|
its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
**********************************************************************************/
|
|
|
|
|
|
/* comment below left for history, data does not represent the implementation in this file */
|
|
|
|
/*********************************************************************
|
|
* 2014/07/28 Saar
|
|
* BLASTEST : OK
|
|
* CTEST : OK
|
|
* TEST : OK
|
|
*
|
|
* 2013/10/28 Saar
|
|
* Parameter:
|
|
* SGEMM_DEFAULT_UNROLL_N 4
|
|
* SGEMM_DEFAULT_UNROLL_M 16
|
|
* SGEMM_DEFAULT_P 768
|
|
* SGEMM_DEFAULT_Q 384
|
|
* A_PR1 512
|
|
* B_PR1 512
|
|
*
|
|
*
|
|
* 2014/07/28 Saar
|
|
* Performance at 9216x9216x9216:
|
|
* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83)
|
|
* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155)
|
|
* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230)
|
|
* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267)
|
|
*
|
|
*********************************************************************/
|
|
|
|
#include "common.h"
|
|
#include <immintrin.h>
|
|
|
|
|
|
|
|
/*******************************************************************************************
|
|
* 8 lines of N
|
|
*******************************************************************************************/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************************
|
|
* 4 lines of N
|
|
*******************************************************************************************/
|
|
|
|
#define INIT64x4() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
row1 = _mm512_setzero_ps(); \
|
|
row2 = _mm512_setzero_ps(); \
|
|
row3 = _mm512_setzero_ps(); \
|
|
row0b = _mm512_setzero_ps(); \
|
|
row1b = _mm512_setzero_ps(); \
|
|
row2b = _mm512_setzero_ps(); \
|
|
row3b = _mm512_setzero_ps(); \
|
|
row0c = _mm512_setzero_ps(); \
|
|
row1c = _mm512_setzero_ps(); \
|
|
row2c = _mm512_setzero_ps(); \
|
|
row3c = _mm512_setzero_ps(); \
|
|
row0d = _mm512_setzero_ps(); \
|
|
row1d = _mm512_setzero_ps(); \
|
|
row2d = _mm512_setzero_ps(); \
|
|
row3d = _mm512_setzero_ps(); \
|
|
|
|
#define KERNEL64x4_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm1 = _mm512_loadu_ps(A1); \
|
|
zmm5 = _mm512_loadu_ps(A2); \
|
|
zmm7 = _mm512_loadu_ps(A3); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \
|
|
row0 += zmm0 * zmm2; \
|
|
row1 += zmm0 * zmm3; \
|
|
row0b += zmm1 * zmm2; \
|
|
row1b += zmm1 * zmm3; \
|
|
row0c += zmm5 * zmm2; \
|
|
row1c += zmm5 * zmm3; \
|
|
row0d += zmm7 * zmm2; \
|
|
row1d += zmm7 * zmm3; \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \
|
|
row2 += zmm0 * zmm2; \
|
|
row3 += zmm0 * zmm3; \
|
|
row2b += zmm1 * zmm2; \
|
|
row3b += zmm1 * zmm3; \
|
|
row2c += zmm5 * zmm2; \
|
|
row3c += zmm5 * zmm3; \
|
|
row2d += zmm7 * zmm2; \
|
|
row3d += zmm7 * zmm3; \
|
|
BO += 4; \
|
|
AO += 16; \
|
|
A1 += 16; \
|
|
A2 += 16; \
|
|
A3 += 16; \
|
|
|
|
|
|
#define SAVE64x4(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row1 *= zmm0; \
|
|
row2 *= zmm0; \
|
|
row3 *= zmm0; \
|
|
row0b *= zmm0; \
|
|
row1b *= zmm0; \
|
|
row2b *= zmm0; \
|
|
row3b *= zmm0; \
|
|
row0c *= zmm0; \
|
|
row1c *= zmm0; \
|
|
row2c *= zmm0; \
|
|
row3c *= zmm0; \
|
|
row0d *= zmm0; \
|
|
row1d *= zmm0; \
|
|
row2d *= zmm0; \
|
|
row3d *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1 + 0*ldc); \
|
|
row1 += _mm512_loadu_ps(CO1 + 1*ldc); \
|
|
row2 += _mm512_loadu_ps(CO1 + 2*ldc); \
|
|
row3 += _mm512_loadu_ps(CO1 + 3*ldc); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc, row0); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc, row1); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc, row2); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc, row3); \
|
|
row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \
|
|
row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \
|
|
row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \
|
|
row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \
|
|
row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \
|
|
row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \
|
|
row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \
|
|
row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 32, row3c); \
|
|
row0d += _mm512_loadu_ps(CO1 + 0*ldc + 48); \
|
|
row1d += _mm512_loadu_ps(CO1 + 1*ldc + 48); \
|
|
row2d += _mm512_loadu_ps(CO1 + 2*ldc + 48); \
|
|
row3d += _mm512_loadu_ps(CO1 + 3*ldc + 48); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 48, row0d); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 48, row1d); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 48, row2d); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 48, row3d);
|
|
|
|
|
|
#define INIT48x4() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
row1 = _mm512_setzero_ps(); \
|
|
row2 = _mm512_setzero_ps(); \
|
|
row3 = _mm512_setzero_ps(); \
|
|
row0b = _mm512_setzero_ps(); \
|
|
row1b = _mm512_setzero_ps(); \
|
|
row2b = _mm512_setzero_ps(); \
|
|
row3b = _mm512_setzero_ps(); \
|
|
row0c = _mm512_setzero_ps(); \
|
|
row1c = _mm512_setzero_ps(); \
|
|
row2c = _mm512_setzero_ps(); \
|
|
row3c = _mm512_setzero_ps(); \
|
|
|
|
#define KERNEL48x4_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm1 = _mm512_loadu_ps(A1); \
|
|
zmm5 = _mm512_loadu_ps(A2); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \
|
|
row0 += zmm0 * zmm2; \
|
|
row1 += zmm0 * zmm3; \
|
|
row0b += zmm1 * zmm2; \
|
|
row1b += zmm1 * zmm3; \
|
|
row0c += zmm5 * zmm2; \
|
|
row1c += zmm5 * zmm3; \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \
|
|
row2 += zmm0 * zmm2; \
|
|
row3 += zmm0 * zmm3; \
|
|
row2b += zmm1 * zmm2; \
|
|
row3b += zmm1 * zmm3; \
|
|
row2c += zmm5 * zmm2; \
|
|
row3c += zmm5 * zmm3; \
|
|
BO += 4; \
|
|
AO += 16; \
|
|
A1 += 16; \
|
|
A2 += 16;
|
|
|
|
|
|
#define SAVE48x4(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row1 *= zmm0; \
|
|
row2 *= zmm0; \
|
|
row3 *= zmm0; \
|
|
row0b *= zmm0; \
|
|
row1b *= zmm0; \
|
|
row2b *= zmm0; \
|
|
row3b *= zmm0; \
|
|
row0c *= zmm0; \
|
|
row1c *= zmm0; \
|
|
row2c *= zmm0; \
|
|
row3c *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1 + 0*ldc); \
|
|
row1 += _mm512_loadu_ps(CO1 + 1*ldc); \
|
|
row2 += _mm512_loadu_ps(CO1 + 2*ldc); \
|
|
row3 += _mm512_loadu_ps(CO1 + 3*ldc); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc, row0); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc, row1); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc, row2); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc, row3); \
|
|
row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \
|
|
row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \
|
|
row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \
|
|
row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 16, row3b); \
|
|
row0c += _mm512_loadu_ps(CO1 + 0*ldc + 32); \
|
|
row1c += _mm512_loadu_ps(CO1 + 1*ldc + 32); \
|
|
row2c += _mm512_loadu_ps(CO1 + 2*ldc + 32); \
|
|
row3c += _mm512_loadu_ps(CO1 + 3*ldc + 32); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 32, row0c); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 32, row1c); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 32, row2c); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 32, row3c);
|
|
|
|
|
|
#define INIT32x4() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
row1 = _mm512_setzero_ps(); \
|
|
row2 = _mm512_setzero_ps(); \
|
|
row3 = _mm512_setzero_ps(); \
|
|
row0b = _mm512_setzero_ps(); \
|
|
row1b = _mm512_setzero_ps(); \
|
|
row2b = _mm512_setzero_ps(); \
|
|
row3b = _mm512_setzero_ps(); \
|
|
|
|
#define KERNEL32x4_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm1 = _mm512_loadu_ps(A1); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \
|
|
row0 += zmm0 * zmm2; \
|
|
row1 += zmm0 * zmm3; \
|
|
row0b += zmm1 * zmm2; \
|
|
row1b += zmm1 * zmm3; \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \
|
|
row2 += zmm0 * zmm2; \
|
|
row3 += zmm0 * zmm3; \
|
|
row2b += zmm1 * zmm2; \
|
|
row3b += zmm1 * zmm3; \
|
|
BO += 4; \
|
|
AO += 16; \
|
|
A1 += 16;
|
|
|
|
|
|
#define SAVE32x4(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row1 *= zmm0; \
|
|
row2 *= zmm0; \
|
|
row3 *= zmm0; \
|
|
row0b *= zmm0; \
|
|
row1b *= zmm0; \
|
|
row2b *= zmm0; \
|
|
row3b *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1 + 0*ldc); \
|
|
row1 += _mm512_loadu_ps(CO1 + 1*ldc); \
|
|
row2 += _mm512_loadu_ps(CO1 + 2*ldc); \
|
|
row3 += _mm512_loadu_ps(CO1 + 3*ldc); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc, row0); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc, row1); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc, row2); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc, row3); \
|
|
row0b += _mm512_loadu_ps(CO1 + 0*ldc + 16); \
|
|
row1b += _mm512_loadu_ps(CO1 + 1*ldc + 16); \
|
|
row2b += _mm512_loadu_ps(CO1 + 2*ldc + 16); \
|
|
row3b += _mm512_loadu_ps(CO1 + 3*ldc + 16); \
|
|
_mm512_storeu_ps(CO1 + 0*ldc + 16, row0b); \
|
|
_mm512_storeu_ps(CO1 + 1*ldc + 16, row1b); \
|
|
_mm512_storeu_ps(CO1 + 2*ldc + 16, row2b); \
|
|
_mm512_storeu_ps(CO1 + 3*ldc + 16, row3b);
|
|
|
|
|
|
|
|
#define INIT16x4() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
row1 = _mm512_setzero_ps(); \
|
|
row2 = _mm512_setzero_ps(); \
|
|
row3 = _mm512_setzero_ps(); \
|
|
|
|
#define KERNEL16x4_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+1)); \
|
|
row0 += zmm0 * zmm2; \
|
|
row1 += zmm0 * zmm3; \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO+2)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO+3)); \
|
|
row2 += zmm0 * zmm2; \
|
|
row3 += zmm0 * zmm3; \
|
|
BO += 4; \
|
|
AO += 16;
|
|
|
|
|
|
#define SAVE16x4(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row1 *= zmm0; \
|
|
row2 *= zmm0; \
|
|
row3 *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
|
|
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
|
|
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
|
|
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
|
|
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
|
|
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
|
|
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
|
|
_mm512_storeu_ps(CO1 + 3 * ldc, row3);
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT8x4() \
|
|
ymm4 = _mm256_setzero_ps(); \
|
|
ymm6 = _mm256_setzero_ps(); \
|
|
ymm8 = _mm256_setzero_ps(); \
|
|
ymm10 = _mm256_setzero_ps(); \
|
|
|
|
#define KERNEL8x4_SUB() \
|
|
ymm0 = _mm256_loadu_ps(AO); \
|
|
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
|
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
|
ymm4 += ymm0 * ymm2; \
|
|
ymm6 += ymm0 * ymm3; \
|
|
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
|
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
|
ymm8 += ymm0 * ymm2; \
|
|
ymm10 += ymm0 * ymm3; \
|
|
BO += 4; \
|
|
AO += 8;
|
|
|
|
|
|
#define SAVE8x4(ALPHA) \
|
|
ymm0 = _mm256_set1_ps(ALPHA); \
|
|
ymm4 *= ymm0; \
|
|
ymm6 *= ymm0; \
|
|
ymm8 *= ymm0; \
|
|
ymm10 *= ymm0; \
|
|
ymm4 += _mm256_loadu_ps(CO1 + 0 * ldc); \
|
|
ymm6 += _mm256_loadu_ps(CO1 + 1 * ldc); \
|
|
ymm8 += _mm256_loadu_ps(CO1 + 2 * ldc); \
|
|
ymm10 += _mm256_loadu_ps(CO1 + 3 * ldc); \
|
|
_mm256_storeu_ps(CO1 + 0 * ldc, ymm4); \
|
|
_mm256_storeu_ps(CO1 + 1 * ldc, ymm6); \
|
|
_mm256_storeu_ps(CO1 + 2 * ldc, ymm8); \
|
|
_mm256_storeu_ps(CO1 + 3 * ldc, ymm10); \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT4x4() \
|
|
row0 = _mm_setzero_ps(); \
|
|
row1 = _mm_setzero_ps(); \
|
|
row2 = _mm_setzero_ps(); \
|
|
row3 = _mm_setzero_ps(); \
|
|
|
|
|
|
#define KERNEL4x4_SUB() \
|
|
xmm0 = _mm_loadu_ps(AO); \
|
|
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
|
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
|
row0 += xmm0 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
|
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
|
row2 += xmm0 * xmm2; \
|
|
row3 += xmm0 * xmm3; \
|
|
BO += 4; \
|
|
AO += 4;
|
|
|
|
|
|
#define SAVE4x4(ALPHA) \
|
|
xmm0 = _mm_set1_ps(ALPHA); \
|
|
row0 *= xmm0; \
|
|
row1 *= xmm0; \
|
|
row2 *= xmm0; \
|
|
row3 *= xmm0; \
|
|
row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
|
|
row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
|
|
row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
|
|
row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
|
|
_mm_storeu_ps(CO1 + 0 * ldc, row0); \
|
|
_mm_storeu_ps(CO1 + 1 * ldc, row1); \
|
|
_mm_storeu_ps(CO1 + 2 * ldc, row2); \
|
|
_mm_storeu_ps(CO1 + 3 * ldc, row3); \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT2x4() \
|
|
row0 = 0; row0b = 0; row1 = 0; row1b = 0; \
|
|
row2 = 0; row2b = 0; row3 = 0; row3b = 0;
|
|
|
|
#define KERNEL2x4_SUB() \
|
|
xmm0 = *(AO); \
|
|
xmm1 = *(AO + 1); \
|
|
xmm2 = *(BO + 0); \
|
|
xmm3 = *(BO + 1); \
|
|
row0 += xmm0 * xmm2; \
|
|
row0b += xmm1 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
row1b += xmm1 * xmm3; \
|
|
xmm2 = *(BO + 2); \
|
|
xmm3 = *(BO + 3); \
|
|
row2 += xmm0 * xmm2; \
|
|
row2b += xmm1 * xmm2; \
|
|
row3 += xmm0 * xmm3; \
|
|
row3b += xmm1 * xmm3; \
|
|
BO += 4; \
|
|
AO += 2;
|
|
|
|
|
|
#define SAVE2x4(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
row0b *= xmm0; \
|
|
row1 *= xmm0; \
|
|
row1b *= xmm0; \
|
|
row2 *= xmm0; \
|
|
row2b *= xmm0; \
|
|
row3 *= xmm0; \
|
|
row3b *= xmm0; \
|
|
*(CO1 + 0 * ldc + 0) += row0; \
|
|
*(CO1 + 0 * ldc + 1) += row0b; \
|
|
*(CO1 + 1 * ldc + 0) += row1; \
|
|
*(CO1 + 1 * ldc + 1) += row1b; \
|
|
*(CO1 + 2 * ldc + 0) += row2; \
|
|
*(CO1 + 2 * ldc + 1) += row2b; \
|
|
*(CO1 + 3 * ldc + 0) += row3; \
|
|
*(CO1 + 3 * ldc + 1) += row3b; \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT1x4() \
|
|
row0 = 0; row1 = 0; row2 = 0; row3 = 0;
|
|
#define KERNEL1x4_SUB() \
|
|
xmm0 = *(AO ); \
|
|
xmm2 = *(BO + 0); \
|
|
xmm3 = *(BO + 1); \
|
|
row0 += xmm0 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
xmm2 = *(BO + 2); \
|
|
xmm3 = *(BO + 3); \
|
|
row2 += xmm0 * xmm2; \
|
|
row3 += xmm0 * xmm3; \
|
|
BO += 4; \
|
|
AO += 1;
|
|
|
|
|
|
#define SAVE1x4(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
row1 *= xmm0; \
|
|
row2 *= xmm0; \
|
|
row3 *= xmm0; \
|
|
*(CO1 + 0 * ldc) += row0; \
|
|
*(CO1 + 1 * ldc) += row1; \
|
|
*(CO1 + 2 * ldc) += row2; \
|
|
*(CO1 + 3 * ldc) += row3; \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
/*******************************************************************************************
|
|
* 2 lines of N
|
|
*******************************************************************************************/
|
|
|
|
#define INIT16x2() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
row1 = _mm512_setzero_ps(); \
|
|
|
|
|
|
#define KERNEL16x2_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
|
row0 += zmm0 * zmm2; \
|
|
row1 += zmm0 * zmm3; \
|
|
BO += 2; \
|
|
AO += 16;
|
|
|
|
|
|
#define SAVE16x2(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row1 *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1); \
|
|
row1 += _mm512_loadu_ps(CO1 + ldc); \
|
|
_mm512_storeu_ps(CO1 , row0); \
|
|
_mm512_storeu_ps(CO1 + ldc, row1); \
|
|
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT8x2() \
|
|
ymm4 = _mm256_setzero_ps(); \
|
|
ymm6 = _mm256_setzero_ps(); \
|
|
|
|
#define KERNEL8x2_SUB() \
|
|
ymm0 = _mm256_loadu_ps(AO); \
|
|
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \
|
|
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
|
ymm4 += ymm0 * ymm2; \
|
|
ymm6 += ymm0 * ymm3; \
|
|
BO += 2; \
|
|
AO += 8;
|
|
|
|
|
|
#define SAVE8x2(ALPHA) \
|
|
ymm0 = _mm256_set1_ps(ALPHA); \
|
|
ymm4 *= ymm0; \
|
|
ymm6 *= ymm0; \
|
|
ymm4 += _mm256_loadu_ps(CO1); \
|
|
ymm6 += _mm256_loadu_ps(CO1 + ldc); \
|
|
_mm256_storeu_ps(CO1 , ymm4); \
|
|
_mm256_storeu_ps(CO1 + ldc, ymm6); \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT4x2() \
|
|
row0 = _mm_setzero_ps(); \
|
|
row1 = _mm_setzero_ps(); \
|
|
|
|
#define KERNEL4x2_SUB() \
|
|
xmm0 = _mm_loadu_ps(AO); \
|
|
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \
|
|
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
|
row0 += xmm0 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
BO += 2; \
|
|
AO += 4;
|
|
|
|
|
|
#define SAVE4x2(ALPHA) \
|
|
xmm0 = _mm_set1_ps(ALPHA); \
|
|
row0 *= xmm0; \
|
|
row1 *= xmm0; \
|
|
row0 += _mm_loadu_ps(CO1); \
|
|
row1 += _mm_loadu_ps(CO1 + ldc); \
|
|
_mm_storeu_ps(CO1 , row0); \
|
|
_mm_storeu_ps(CO1 + ldc, row1); \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
|
|
#define INIT2x2() \
|
|
row0 = 0; row0b = 0; row1 = 0; row1b = 0; \
|
|
|
|
#define KERNEL2x2_SUB() \
|
|
xmm0 = *(AO + 0); \
|
|
xmm1 = *(AO + 1); \
|
|
xmm2 = *(BO + 0); \
|
|
xmm3 = *(BO + 1); \
|
|
row0 += xmm0 * xmm2; \
|
|
row0b += xmm1 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
row1b += xmm1 * xmm3; \
|
|
BO += 2; \
|
|
AO += 2; \
|
|
|
|
|
|
#define SAVE2x2(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
row0b *= xmm0; \
|
|
row1 *= xmm0; \
|
|
row1b *= xmm0; \
|
|
*(CO1 ) += row0; \
|
|
*(CO1 +1 ) += row0b; \
|
|
*(CO1 + ldc ) += row1; \
|
|
*(CO1 + ldc +1) += row1b; \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT1x2() \
|
|
row0 = 0; row1 = 0;
|
|
|
|
#define KERNEL1x2_SUB() \
|
|
xmm0 = *(AO); \
|
|
xmm2 = *(BO + 0); \
|
|
xmm3 = *(BO + 1); \
|
|
row0 += xmm0 * xmm2; \
|
|
row1 += xmm0 * xmm3; \
|
|
BO += 2; \
|
|
AO += 1;
|
|
|
|
|
|
#define SAVE1x2(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
row1 *= xmm0; \
|
|
*(CO1 ) += row0; \
|
|
*(CO1 + ldc ) += row1; \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
/*******************************************************************************************
|
|
* 1 line of N
|
|
*******************************************************************************************/
|
|
|
|
#define INIT16x1() \
|
|
row0 = _mm512_setzero_ps(); \
|
|
|
|
#define KERNEL16x1_SUB() \
|
|
zmm0 = _mm512_loadu_ps(AO); \
|
|
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO)); \
|
|
row0 += zmm0 * zmm2; \
|
|
BO += 1; \
|
|
AO += 16;
|
|
|
|
|
|
#define SAVE16x1(ALPHA) \
|
|
zmm0 = _mm512_set1_ps(ALPHA); \
|
|
row0 *= zmm0; \
|
|
row0 += _mm512_loadu_ps(CO1); \
|
|
_mm512_storeu_ps(CO1 , row0); \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT8x1() \
|
|
ymm4 = _mm256_setzero_ps();
|
|
|
|
#define KERNEL8x1_SUB() \
|
|
ymm0 = _mm256_loadu_ps(AO); \
|
|
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO)); \
|
|
ymm4 += ymm0 * ymm2; \
|
|
BO += 1; \
|
|
AO += 8;
|
|
|
|
|
|
#define SAVE8x1(ALPHA) \
|
|
ymm0 = _mm256_set1_ps(ALPHA); \
|
|
ymm4 *= ymm0; \
|
|
ymm4 += _mm256_loadu_ps(CO1); \
|
|
_mm256_storeu_ps(CO1 , ymm4); \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT4x1() \
|
|
row0 = _mm_setzero_ps(); \
|
|
|
|
#define KERNEL4x1_SUB() \
|
|
xmm0 = _mm_loadu_ps(AO); \
|
|
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO)); \
|
|
row0 += xmm0 * xmm2; \
|
|
BO += 1; \
|
|
AO += 4;
|
|
|
|
|
|
#define SAVE4x1(ALPHA) \
|
|
xmm0 = _mm_set1_ps(ALPHA); \
|
|
row0 *= xmm0; \
|
|
row0 += _mm_loadu_ps(CO1); \
|
|
_mm_storeu_ps(CO1 , row0); \
|
|
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT2x1() \
|
|
row0 = 0; row0b = 0;
|
|
|
|
#define KERNEL2x1_SUB() \
|
|
xmm0 = *(AO + 0); \
|
|
xmm1 = *(AO + 1); \
|
|
xmm2 = *(BO); \
|
|
row0 += xmm0 * xmm2; \
|
|
row0b += xmm1 * xmm2; \
|
|
BO += 1; \
|
|
AO += 2;
|
|
|
|
|
|
#define SAVE2x1(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
row0b *= xmm0; \
|
|
*(CO1 ) += row0; \
|
|
*(CO1 +1 ) += row0b; \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
#define INIT1x1() \
|
|
row0 = 0;
|
|
|
|
#define KERNEL1x1_SUB() \
|
|
xmm0 = *(AO); \
|
|
xmm2 = *(BO); \
|
|
row0 += xmm0 * xmm2; \
|
|
BO += 1; \
|
|
AO += 1;
|
|
|
|
|
|
#define SAVE1x1(ALPHA) \
|
|
xmm0 = ALPHA; \
|
|
row0 *= xmm0; \
|
|
*(CO1 ) += row0; \
|
|
|
|
|
|
/*******************************************************************************************/
|
|
|
|
|
|
/*************************************************************************************
|
|
* GEMM Kernel
|
|
*************************************************************************************/
|
|
|
|
int __attribute__ ((noinline))
|
|
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc)
|
|
{
|
|
unsigned long M = m, N = n, K = k;
|
|
if (M == 0)
|
|
return 0;
|
|
if (N == 0)
|
|
return 0;
|
|
if (K == 0)
|
|
return 0;
|
|
|
|
|
|
while (N >= 4) {
|
|
float *CO1;
|
|
float *AO;
|
|
int i;
|
|
// L8_10
|
|
CO1 = C;
|
|
C += 4 * ldc;
|
|
|
|
AO = A;
|
|
|
|
i = m;
|
|
while (i >= 64) {
|
|
float *BO;
|
|
float *A1, *A2, *A3;
|
|
// L8_11
|
|
__m512 zmm0, zmm1, zmm2, zmm3, row0, zmm5, row1, zmm7, row2, row3, row0b, row1b, row2b, row3b, row0c, row1c, row2c, row3c, row0d, row1d, row2d, row3d;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
A1 = AO + 16 * K;
|
|
A2 = A1 + 16 * K;
|
|
A3 = A2 + 16 * K;
|
|
|
|
INIT64x4()
|
|
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL64x4_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE64x4(alpha)
|
|
CO1 += 64;
|
|
AO += 48 * K;
|
|
|
|
i -= 64;
|
|
}
|
|
while (i >= 32) {
|
|
float *BO;
|
|
float *A1;
|
|
// L8_11
|
|
__m512 zmm0, zmm1, zmm2, zmm3, row0, row1, row2, row3, row0b, row1b, row2b, row3b;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
A1 = AO + 16 * K;
|
|
|
|
INIT32x4()
|
|
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL32x4_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE32x4(alpha)
|
|
CO1 += 32;
|
|
AO += 16 * K;
|
|
|
|
i -= 32;
|
|
}
|
|
while (i >= 16) {
|
|
float *BO;
|
|
// L8_11
|
|
__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT16x4()
|
|
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL16x4_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE16x4(alpha)
|
|
CO1 += 16;
|
|
|
|
i -= 16;
|
|
}
|
|
while (i >= 8) {
|
|
float *BO;
|
|
// L8_11
|
|
__m256 ymm0, ymm2, ymm3, ymm4, ymm6,ymm8,ymm10;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT8x4()
|
|
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL8x4_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE8x4(alpha)
|
|
CO1 += 8;
|
|
|
|
i -= 8;
|
|
}
|
|
while (i >= 4) {
|
|
// L8_11
|
|
float *BO;
|
|
__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT4x4()
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL4x4_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE4x4(alpha)
|
|
CO1 += 4;
|
|
|
|
i -= 4;
|
|
}
|
|
|
|
/**************************************************************************
|
|
* Rest of M
|
|
***************************************************************************/
|
|
|
|
while (i >= 2) {
|
|
float *BO;
|
|
float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b, row2, row2b, row3, row3b;
|
|
BO = B;
|
|
|
|
INIT2x4()
|
|
int kloop = K;
|
|
|
|
while (kloop > 0) {
|
|
KERNEL2x4_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE2x4(alpha)
|
|
CO1 += 2;
|
|
i -= 2;
|
|
}
|
|
// L13_40
|
|
while (i >= 1) {
|
|
float *BO;
|
|
float xmm0, xmm2, xmm3, row0, row1, row2, row3;
|
|
int kloop = K;
|
|
BO = B;
|
|
INIT1x4()
|
|
|
|
while (kloop > 0) {
|
|
KERNEL1x4_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE1x4(alpha)
|
|
CO1 += 1;
|
|
i -= 1;
|
|
}
|
|
|
|
B += K * 4;
|
|
N -= 4;
|
|
}
|
|
|
|
/**************************************************************************************************/
|
|
|
|
// L8_0
|
|
while (N >= 2) {
|
|
float *CO1;
|
|
float *AO;
|
|
int i;
|
|
// L8_10
|
|
CO1 = C;
|
|
C += 2 * ldc;
|
|
|
|
AO = A;
|
|
|
|
i = m;
|
|
while (i >= 16) {
|
|
float *BO;
|
|
|
|
// L8_11
|
|
__m512 zmm0, zmm2, zmm3, row0, row1;
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT16x2()
|
|
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL16x2_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE16x2(alpha)
|
|
CO1 += 16;
|
|
|
|
i -= 16;
|
|
}
|
|
while (i >= 8) {
|
|
float *BO;
|
|
__m256 ymm0, ymm2, ymm3, ymm4, ymm6;
|
|
// L8_11
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT8x2()
|
|
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL8x2_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE8x2(alpha)
|
|
CO1 += 8;
|
|
|
|
i-=8;
|
|
}
|
|
|
|
while (i >= 4) {
|
|
float *BO;
|
|
__m128 xmm0, xmm2, xmm3, row0, row1;
|
|
// L8_11
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT4x2()
|
|
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL4x2_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE4x2(alpha)
|
|
CO1 += 4;
|
|
|
|
i-=4;
|
|
}
|
|
|
|
/**************************************************************************
|
|
* Rest of M
|
|
***************************************************************************/
|
|
|
|
while (i >= 2) {
|
|
float *BO;
|
|
float xmm0, xmm1, xmm2, xmm3, row0, row0b, row1, row1b;
|
|
int kloop = K;
|
|
BO = B;
|
|
|
|
INIT2x2()
|
|
|
|
while (kloop > 0) {
|
|
KERNEL2x2_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE2x2(alpha)
|
|
CO1 += 2;
|
|
i -= 2;
|
|
}
|
|
// L13_40
|
|
while (i >= 1) {
|
|
float *BO;
|
|
float xmm0, xmm2, xmm3, row0, row1;
|
|
int kloop = K;
|
|
BO = B;
|
|
|
|
INIT1x2()
|
|
|
|
while (kloop > 0) {
|
|
KERNEL1x2_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE1x2(alpha)
|
|
CO1 += 1;
|
|
i -= 1;
|
|
}
|
|
|
|
B += K * 2;
|
|
N -= 2;
|
|
}
|
|
|
|
// L8_0
|
|
while (N >= 1) {
|
|
// L8_10
|
|
float *CO1;
|
|
float *AO;
|
|
int i;
|
|
|
|
CO1 = C;
|
|
C += ldc;
|
|
|
|
AO = A;
|
|
|
|
i = m;
|
|
while (i >= 16) {
|
|
float *BO;
|
|
__m512 zmm0, zmm2, row0;
|
|
// L8_11
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT16x1()
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL16x1_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE16x1(alpha)
|
|
CO1 += 16;
|
|
|
|
i-= 16;
|
|
}
|
|
while (i >= 8) {
|
|
float *BO;
|
|
__m256 ymm0, ymm2, ymm4;
|
|
// L8_11
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT8x1()
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL8x1_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE8x1(alpha)
|
|
CO1 += 8;
|
|
|
|
i-= 8;
|
|
}
|
|
while (i >= 4) {
|
|
float *BO;
|
|
__m128 xmm0, xmm2, row0;
|
|
// L8_11
|
|
BO = B;
|
|
int kloop = K;
|
|
|
|
INIT4x1()
|
|
// L8_16
|
|
while (kloop > 0) {
|
|
// L12_17
|
|
KERNEL4x1_SUB()
|
|
kloop--;
|
|
}
|
|
// L8_19
|
|
SAVE4x1(alpha)
|
|
CO1 += 4;
|
|
|
|
i-= 4;
|
|
}
|
|
|
|
/**************************************************************************
|
|
* Rest of M
|
|
***************************************************************************/
|
|
|
|
while (i >= 2) {
|
|
float *BO;
|
|
float xmm0, xmm1, xmm2, row0, row0b;
|
|
int kloop = K;
|
|
BO = B;
|
|
|
|
INIT2x1()
|
|
|
|
while (kloop > 0) {
|
|
KERNEL2x1_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE2x1(alpha)
|
|
CO1 += 2;
|
|
i -= 2;
|
|
}
|
|
// L13_40
|
|
while (i >= 1) {
|
|
float *BO;
|
|
float xmm0, xmm2, row0;
|
|
int kloop = K;
|
|
|
|
BO = B;
|
|
INIT1x1()
|
|
|
|
|
|
while (kloop > 0) {
|
|
KERNEL1x1_SUB()
|
|
kloop--;
|
|
}
|
|
SAVE1x1(alpha)
|
|
CO1 += 1;
|
|
i -= 1;
|
|
}
|
|
|
|
B += K * 1;
|
|
N -= 1;
|
|
}
|
|
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
* "Direct sgemm" code. This code operates directly on the inputs and outputs
|
|
* of the sgemm call, avoiding the copies, memory realignments and threading,
|
|
* and only supports alpha = 1 and beta = 0.
|
|
* This is a common case and provides value for relatively small matrixes.
|
|
* For larger matrixes the "regular" sgemm code is superior, there the cost of
|
|
* copying/shuffling the B matrix really pays off.
|
|
*/
|
|
|
|
|
|
|
|
#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
|
|
#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
|
|
#define LOAD_B_512(N,M) __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
|
|
#define MATMUL_512(N,M) result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
|
|
#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
|
|
|
|
|
|
#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
|
|
#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
|
|
#define LOAD_B_256(N,M) __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
|
|
#define MATMUL_256(N,M) result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
|
|
#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
|
|
|
|
#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
|
|
#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k + strideA * (i+M)]))
|
|
#define LOAD_B_128(N,M) __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
|
|
#define MATMUL_128(N,M) result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
|
|
#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
|
|
|
|
#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
|
|
#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
|
|
#define LOAD_B_SCALAR(N,M) float Bval##N = B[k * strideB + j + N];
|
|
#define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N;
|
|
#define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M;
|
|
|
|
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
|
|
{
|
|
int mnk = M * N * K;
|
|
/* large matrixes -> not performant */
|
|
if (mnk >= 28 * 512 * 512)
|
|
return 0;
|
|
|
|
/*
|
|
* if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
|
|
* and the regular sgemm copy/realignment of data pays off much quicker
|
|
*/
|
|
if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
|
|
return 0;
|
|
|
|
#ifdef SMP
|
|
/* if we can run multithreaded, the threading changes the based threshold */
|
|
if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
|
|
return 0;
|
|
#endif
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
|
|
void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
|
|
{
|
|
int i, j, k;
|
|
|
|
int m4 = M & ~3;
|
|
int m2 = M & ~1;
|
|
|
|
int n64 = N & ~63;
|
|
int n32 = N & ~31;
|
|
int n16 = N & ~15;
|
|
int n8 = N & ~7;
|
|
int n4 = N & ~3;
|
|
int n2 = N & ~1;
|
|
|
|
i = 0;
|
|
|
|
for (i = 0; i < m4; i+=4) {
|
|
|
|
for (j = 0; j < n64; j+= 64) {
|
|
k = 0;
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
|
|
DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
|
|
DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2);
|
|
DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3);
|
|
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
BROADCAST_LOAD_A_512(x, 2);
|
|
BROADCAST_LOAD_A_512(x, 3);
|
|
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);
|
|
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
|
|
MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
|
|
MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2);
|
|
MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
|
|
STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1);
|
|
STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2);
|
|
STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3);
|
|
}
|
|
|
|
for (; j < n32; j+= 32) {
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
|
|
DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1);
|
|
DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2);
|
|
DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
BROADCAST_LOAD_A_512(x, 2);
|
|
BROADCAST_LOAD_A_512(x, 3);
|
|
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x);
|
|
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0);
|
|
MATMUL_512(0, 1); MATMUL_512(1, 1);
|
|
MATMUL_512(0, 2); MATMUL_512(1, 2);
|
|
MATMUL_512(0, 3); MATMUL_512(1, 3);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0);
|
|
STORE_512(0, 1); STORE_512(1, 1);
|
|
STORE_512(0, 2); STORE_512(1, 2);
|
|
STORE_512(0, 3); STORE_512(1, 3);
|
|
}
|
|
|
|
for (; j < n16; j+= 16) {
|
|
DECLARE_RESULT_512(0, 0);
|
|
DECLARE_RESULT_512(0, 1);
|
|
DECLARE_RESULT_512(0, 2);
|
|
DECLARE_RESULT_512(0, 3);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
BROADCAST_LOAD_A_512(x, 2);
|
|
BROADCAST_LOAD_A_512(x, 3);
|
|
|
|
LOAD_B_512(0, x);
|
|
|
|
MATMUL_512(0, 0);
|
|
MATMUL_512(0, 1);
|
|
MATMUL_512(0, 2);
|
|
MATMUL_512(0, 3);
|
|
}
|
|
STORE_512(0, 0);
|
|
STORE_512(0, 1);
|
|
STORE_512(0, 2);
|
|
STORE_512(0, 3);
|
|
}
|
|
|
|
for (; j < n8; j+= 8) {
|
|
DECLARE_RESULT_256(0, 0);
|
|
DECLARE_RESULT_256(0, 1);
|
|
DECLARE_RESULT_256(0, 2);
|
|
DECLARE_RESULT_256(0, 3);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_256(x, 0);
|
|
BROADCAST_LOAD_A_256(x, 1);
|
|
BROADCAST_LOAD_A_256(x, 2);
|
|
BROADCAST_LOAD_A_256(x, 3);
|
|
|
|
LOAD_B_256(0, x);
|
|
|
|
MATMUL_256(0, 0);
|
|
MATMUL_256(0, 1);
|
|
MATMUL_256(0, 2);
|
|
MATMUL_256(0, 3);
|
|
}
|
|
STORE_256(0, 0);
|
|
STORE_256(0, 1);
|
|
STORE_256(0, 2);
|
|
STORE_256(0, 3);
|
|
}
|
|
|
|
for (; j < n4; j+= 4) {
|
|
DECLARE_RESULT_128(0, 0);
|
|
DECLARE_RESULT_128(0, 1);
|
|
DECLARE_RESULT_128(0, 2);
|
|
DECLARE_RESULT_128(0, 3);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_128(x, 0);
|
|
BROADCAST_LOAD_A_128(x, 1);
|
|
BROADCAST_LOAD_A_128(x, 2);
|
|
BROADCAST_LOAD_A_128(x, 3);
|
|
|
|
LOAD_B_128(0, x);
|
|
|
|
MATMUL_128(0, 0);
|
|
MATMUL_128(0, 1);
|
|
MATMUL_128(0, 2);
|
|
MATMUL_128(0, 3);
|
|
}
|
|
STORE_128(0, 0);
|
|
STORE_128(0, 1);
|
|
STORE_128(0, 2);
|
|
STORE_128(0, 3);
|
|
}
|
|
|
|
for (; j < n2; j+= 2) {
|
|
DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
|
|
DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1);
|
|
DECLARE_RESULT_SCALAR(0, 2); DECLARE_RESULT_SCALAR(1, 2);
|
|
DECLARE_RESULT_SCALAR(0, 3); DECLARE_RESULT_SCALAR(1, 3);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(x, 0);
|
|
BROADCAST_LOAD_A_SCALAR(x, 1);
|
|
BROADCAST_LOAD_A_SCALAR(x, 2);
|
|
BROADCAST_LOAD_A_SCALAR(x, 3);
|
|
|
|
LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x);
|
|
|
|
MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
|
|
MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1);
|
|
MATMUL_SCALAR(0, 2); MATMUL_SCALAR(1, 2);
|
|
MATMUL_SCALAR(0, 3); MATMUL_SCALAR(1, 3);
|
|
}
|
|
STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
|
|
STORE_SCALAR(0, 1); STORE_SCALAR(1, 1);
|
|
STORE_SCALAR(0, 2); STORE_SCALAR(1, 2);
|
|
STORE_SCALAR(0, 3); STORE_SCALAR(1, 3);
|
|
}
|
|
|
|
for (; j < N; j++) {
|
|
DECLARE_RESULT_SCALAR(0, 0)
|
|
DECLARE_RESULT_SCALAR(0, 1)
|
|
DECLARE_RESULT_SCALAR(0, 2)
|
|
DECLARE_RESULT_SCALAR(0, 3)
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(0, 0);
|
|
BROADCAST_LOAD_A_SCALAR(0, 1);
|
|
BROADCAST_LOAD_A_SCALAR(0, 2);
|
|
BROADCAST_LOAD_A_SCALAR(0, 3);
|
|
|
|
LOAD_B_SCALAR(0, 0);
|
|
|
|
MATMUL_SCALAR(0, 0);
|
|
MATMUL_SCALAR(0, 1);
|
|
MATMUL_SCALAR(0, 2);
|
|
MATMUL_SCALAR(0, 3);
|
|
}
|
|
STORE_SCALAR(0, 0);
|
|
STORE_SCALAR(0, 1);
|
|
STORE_SCALAR(0, 2);
|
|
STORE_SCALAR(0, 3);
|
|
}
|
|
}
|
|
|
|
for (; i < m2; i+=2) {
|
|
j = 0;
|
|
|
|
for (; j < n64; j+= 64) {
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
|
|
DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
|
|
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);
|
|
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
|
|
MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
|
|
STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1);
|
|
}
|
|
|
|
for (; j < n32; j+= 32) {
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
|
|
DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x);
|
|
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0);
|
|
MATMUL_512(0, 1); MATMUL_512(1, 1);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0);
|
|
STORE_512(0, 1); STORE_512(1, 1);
|
|
}
|
|
|
|
|
|
for (; j < n16; j+= 16) {
|
|
DECLARE_RESULT_512(0, 0);
|
|
DECLARE_RESULT_512(0, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
BROADCAST_LOAD_A_512(x, 1);
|
|
|
|
LOAD_B_512(0, x);
|
|
|
|
MATMUL_512(0, 0);
|
|
MATMUL_512(0, 1);
|
|
}
|
|
STORE_512(0, 0);
|
|
STORE_512(0, 1);
|
|
}
|
|
|
|
for (; j < n8; j+= 8) {
|
|
DECLARE_RESULT_256(0, 0);
|
|
DECLARE_RESULT_256(0, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_256(x, 0);
|
|
BROADCAST_LOAD_A_256(x, 1);
|
|
|
|
LOAD_B_256(0, x);
|
|
|
|
MATMUL_256(0, 0);
|
|
MATMUL_256(0, 1);
|
|
}
|
|
STORE_256(0, 0);
|
|
STORE_256(0, 1);
|
|
}
|
|
|
|
for (; j < n4; j+= 4) {
|
|
DECLARE_RESULT_128(0, 0);
|
|
DECLARE_RESULT_128(0, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_128(x, 0);
|
|
BROADCAST_LOAD_A_128(x, 1);
|
|
|
|
LOAD_B_128(0, x);
|
|
|
|
MATMUL_128(0, 0);
|
|
MATMUL_128(0, 1);
|
|
}
|
|
STORE_128(0, 0);
|
|
STORE_128(0, 1);
|
|
}
|
|
for (; j < n2; j+= 2) {
|
|
DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
|
|
DECLARE_RESULT_SCALAR(0, 1); DECLARE_RESULT_SCALAR(1, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(x, 0);
|
|
BROADCAST_LOAD_A_SCALAR(x, 1);
|
|
|
|
LOAD_B_SCALAR(0, x); LOAD_B_SCALAR(1, x);
|
|
|
|
MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
|
|
MATMUL_SCALAR(0, 1); MATMUL_SCALAR(1, 1);
|
|
}
|
|
STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
|
|
STORE_SCALAR(0, 1); STORE_SCALAR(1, 1);
|
|
}
|
|
|
|
for (; j < N; j++) {
|
|
DECLARE_RESULT_SCALAR(0, 0);
|
|
DECLARE_RESULT_SCALAR(0, 1);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(0, 0);
|
|
BROADCAST_LOAD_A_SCALAR(0, 1);
|
|
|
|
LOAD_B_SCALAR(0, 0);
|
|
|
|
MATMUL_SCALAR(0, 0);
|
|
MATMUL_SCALAR(0, 1);
|
|
}
|
|
STORE_SCALAR(0, 0);
|
|
STORE_SCALAR(0, 1);
|
|
}
|
|
}
|
|
|
|
for (; i < M; i+=1) {
|
|
j = 0;
|
|
for (; j < n64; j+= 64) {
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x); LOAD_B_512(2, x); LOAD_B_512(3, x);
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0);
|
|
}
|
|
for (; j < n32; j+= 32) {
|
|
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
LOAD_B_512(0, x); LOAD_B_512(1, x);
|
|
MATMUL_512(0, 0); MATMUL_512(1, 0);
|
|
}
|
|
STORE_512(0, 0); STORE_512(1, 0);
|
|
}
|
|
|
|
|
|
for (; j < n16; j+= 16) {
|
|
DECLARE_RESULT_512(0, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_512(x, 0);
|
|
|
|
LOAD_B_512(0, x);
|
|
|
|
MATMUL_512(0, 0);
|
|
}
|
|
STORE_512(0, 0);
|
|
}
|
|
|
|
for (; j < n8; j+= 8) {
|
|
DECLARE_RESULT_256(0, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_256(x, 0);
|
|
LOAD_B_256(0, x);
|
|
MATMUL_256(0, 0);
|
|
}
|
|
STORE_256(0, 0);
|
|
}
|
|
|
|
for (; j < n4; j+= 4) {
|
|
DECLARE_RESULT_128(0, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_128(x, 0);
|
|
LOAD_B_128(0, x);
|
|
MATMUL_128(0, 0);
|
|
}
|
|
STORE_128(0, 0);
|
|
}
|
|
|
|
for (; j < n2; j+= 2) {
|
|
DECLARE_RESULT_SCALAR(0, 0); DECLARE_RESULT_SCALAR(1, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(x, 0);
|
|
LOAD_B_SCALAR(0, 0); LOAD_B_SCALAR(1, 0);
|
|
MATMUL_SCALAR(0, 0); MATMUL_SCALAR(1, 0);
|
|
}
|
|
STORE_SCALAR(0, 0); STORE_SCALAR(1, 0);
|
|
}
|
|
|
|
for (; j < N; j++) {
|
|
DECLARE_RESULT_SCALAR(0, 0);
|
|
|
|
for (k = 0; k < K; k++) {
|
|
BROADCAST_LOAD_A_SCALAR(0, 0);
|
|
LOAD_B_SCALAR(0, 0);
|
|
MATMUL_SCALAR(0, 0);
|
|
}
|
|
STORE_SCALAR(0, 0);
|
|
}
|
|
}
|
|
} |