enable the SGEMM/SKX C based kernel
In QA the final bug was found so now the sklyakex sgemm C based kernel can be activated....
This commit is contained in:
parent
2263d3906c
commit
55b244ca0d
|
@ -1,6 +1,11 @@
|
|||
include $(KERNELDIR)/KERNEL.HASWELL
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
|
||||
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
|
||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
|
||||
|
||||
|
@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
|
|||
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
|
||||
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
|
||||
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
DGEMM_BETA = dgemm_beta_skylakex.c
|
||||
|
|
|
@ -60,8 +60,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
|
||||
if (beta == ZERO){
|
||||
__m512 z_zero;
|
||||
__m256 y_zero;
|
||||
|
||||
z_zero = _mm512_setzero_ps();
|
||||
y_zero = _mm256_setzero_ps();
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
|
@ -71,14 +73,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
|
||||
while (i > 32) {
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 8, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 24 , z_zero);
|
||||
c_offset1 += 32;
|
||||
i -= 32;
|
||||
}
|
||||
while (i > 8) {
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
_mm256_storeu_ps(c_offset1, y_zero);
|
||||
c_offset1 += 8;
|
||||
i -= 8;
|
||||
}
|
||||
|
|
|
@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
|
||||
#define INIT32x8() \
|
||||
row0 = _mm512_setzero_ps(); \
|
||||
row1 = _mm512_setzero_ps(); \
|
||||
row2 = _mm512_setzero_ps(); \
|
||||
row3 = _mm512_setzero_ps(); \
|
||||
row4 = _mm512_setzero_ps(); \
|
||||
row5 = _mm512_setzero_ps(); \
|
||||
row6 = _mm512_setzero_ps(); \
|
||||
row0b = _mm512_setzero_ps(); \
|
||||
row1b = _mm512_setzero_ps(); \
|
||||
row2b = _mm512_setzero_ps(); \
|
||||
row3b = _mm512_setzero_ps(); \
|
||||
row4b = _mm512_setzero_ps(); \
|
||||
row5b = _mm512_setzero_ps(); \
|
||||
row6b = _mm512_setzero_ps(); \
|
||||
row7b = _mm512_setzero_ps(); \
|
||||
|
||||
#define KERNEL32x8_SUB() \
|
||||
zmm0 = _mm512_loadu_ps(AO); \
|
||||
zmm0b = _mm512_loadu_ps(AOb); \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += zmm0 * zmm2; \
|
||||
row1 += zmm0 * zmm3; \
|
||||
row0b += zmm0b * zmm2; \
|
||||
row1b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += zmm0 * zmm2; \
|
||||
row3 += zmm0 * zmm3; \
|
||||
row2b += zmm0b * zmm2; \
|
||||
row3b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += zmm0 * zmm2; \
|
||||
row5 += zmm0 * zmm3; \
|
||||
row4b += zmm0b * zmm2; \
|
||||
row5b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += zmm0 * zmm2; \
|
||||
row7 += zmm0 * zmm3; \
|
||||
row6b += zmm0b * zmm2; \
|
||||
row7b += zmm0b * zmm3; \
|
||||
BO += 8; \
|
||||
AO += 16; \
|
||||
AOb += 16;
|
||||
|
||||
|
||||
#define SAVE32x8(ALPHA) \
|
||||
zmm0 = _mm512_set1_ps(ALPHA); \
|
||||
row0 *= zmm0; \
|
||||
row1 *= zmm0; \
|
||||
row2 *= zmm0; \
|
||||
row3 *= zmm0; \
|
||||
row4 *= zmm0; \
|
||||
row5 *= zmm0; \
|
||||
row6 *= zmm0; \
|
||||
row7 *= zmm0; \
|
||||
row0b *= zmm0; \
|
||||
row1b *= zmm0; \
|
||||
row2b *= zmm0; \
|
||||
row3b *= zmm0; \
|
||||
row4b *= zmm0; \
|
||||
row5b *= zmm0; \
|
||||
row6b *= zmm0; \
|
||||
row7b *= zmm0; \
|
||||
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \
|
||||
row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \
|
||||
row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \
|
||||
row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \
|
||||
row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \
|
||||
row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \
|
||||
row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \
|
||||
row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \
|
||||
|
||||
|
||||
#define INIT16x8() \
|
||||
row0 = _mm512_setzero_ps(); \
|
||||
row1 = _mm512_setzero_ps(); \
|
||||
row2 = _mm512_setzero_ps(); \
|
||||
row3 = _mm512_setzero_ps(); \
|
||||
row4 = _mm512_setzero_ps(); \
|
||||
row5 = _mm512_setzero_ps(); \
|
||||
row6 = _mm512_setzero_ps(); \
|
||||
row7 = _mm512_setzero_ps(); \
|
||||
|
||||
#define KERNEL16x8_SUB() \
|
||||
zmm0 = _mm512_loadu_ps(AO); \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += zmm0 * zmm2; \
|
||||
row1 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += zmm0 * zmm2; \
|
||||
row3 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += zmm0 * zmm2; \
|
||||
row5 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += zmm0 * zmm2; \
|
||||
row7 += zmm0 * zmm3; \
|
||||
BO += 8; \
|
||||
AO += 16;
|
||||
|
||||
|
||||
#define SAVE16x8(ALPHA) \
|
||||
zmm0 = _mm512_set1_ps(ALPHA); \
|
||||
row0 *= zmm0; \
|
||||
row1 *= zmm0; \
|
||||
row2 *= zmm0; \
|
||||
row3 *= zmm0; \
|
||||
row4 *= zmm0; \
|
||||
row5 *= zmm0; \
|
||||
row6 *= zmm0; \
|
||||
row7 *= zmm0; \
|
||||
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc, row7);
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT8x8() \
|
||||
row0 = _mm256_setzero_ps(); \
|
||||
row1 = _mm256_setzero_ps(); \
|
||||
row2 = _mm256_setzero_ps(); \
|
||||
row3 = _mm256_setzero_ps(); \
|
||||
row4 = _mm256_setzero_ps(); \
|
||||
row5 = _mm256_setzero_ps(); \
|
||||
row6 = _mm256_setzero_ps(); \
|
||||
row7 = _mm256_setzero_ps(); \
|
||||
|
||||
#define KERNEL8x8_SUB() \
|
||||
ymm0 = _mm256_loadu_ps(AO); \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += ymm0 * ymm2; \
|
||||
row1 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += ymm0 * ymm2; \
|
||||
row3 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += ymm0 * ymm2; \
|
||||
row5 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += ymm0 * ymm2; \
|
||||
row7 += ymm0 * ymm3; \
|
||||
BO += 8; \
|
||||
AO += 8;
|
||||
|
||||
|
||||
#define SAVE8x8(ALPHA) \
|
||||
ymm0 = _mm256_set1_ps(ALPHA); \
|
||||
row0 *= ymm0; \
|
||||
row1 *= ymm0; \
|
||||
row2 *= ymm0; \
|
||||
row3 *= ymm0; \
|
||||
row4 *= ymm0; \
|
||||
row5 *= ymm0; \
|
||||
row6 *= ymm0; \
|
||||
row7 *= ymm0; \
|
||||
row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm256_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm256_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm256_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm256_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm256_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm256_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm256_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm256_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT4x8() \
|
||||
row0 = _mm_setzero_ps(); \
|
||||
row1 = _mm_setzero_ps(); \
|
||||
row2 = _mm_setzero_ps(); \
|
||||
row3 = _mm_setzero_ps(); \
|
||||
row4 = _mm_setzero_ps(); \
|
||||
row5 = _mm_setzero_ps(); \
|
||||
row6 = _mm_setzero_ps(); \
|
||||
row7 = _mm_setzero_ps(); \
|
||||
|
||||
|
||||
#define KERNEL4x8_SUB() \
|
||||
xmm0 = _mm_loadu_ps(AO); \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += xmm0 * xmm2; \
|
||||
row1 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += xmm0 * xmm2; \
|
||||
row3 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += xmm0 * xmm2; \
|
||||
row5 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += xmm0 * xmm2; \
|
||||
row7 += xmm0 * xmm3; \
|
||||
BO += 8; \
|
||||
AO += 4;
|
||||
|
||||
|
||||
#define SAVE4x8(ALPHA) \
|
||||
xmm0 = _mm_set1_ps(ALPHA); \
|
||||
row0 *= xmm0; \
|
||||
row1 *= xmm0; \
|
||||
row2 *= xmm0; \
|
||||
row3 *= xmm0; \
|
||||
row4 *= xmm0; \
|
||||
row5 *= xmm0; \
|
||||
row6 *= xmm0; \
|
||||
row7 *= xmm0; \
|
||||
row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT2x8() \
|
||||
row0a = row0b = 0; \
|
||||
row1a = row1b = 0; \
|
||||
row2a = row2b = 0; \
|
||||
row3a = row3b = 0; \
|
||||
row4a = row4b = 0; \
|
||||
row5a = row5b = 0; \
|
||||
row6a = row6b = 0; \
|
||||
row7a = row7b = 0; \
|
||||
|
||||
#define KERNEL2x8_SUB() \
|
||||
xmm0 = *(AO); \
|
||||
xmm1 = *(AO + 1); \
|
||||
xmm2 = *(BO + 0); \
|
||||
xmm3 = *(BO + 1); \
|
||||
row0a += xmm0 * xmm2; \
|
||||
row0b += xmm1 * xmm2; \
|
||||
row1a += xmm0 * xmm3; \
|
||||
row1b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 2); \
|
||||
xmm3 = *(BO + 3); \
|
||||
row2a += xmm0 * xmm2; \
|
||||
row2b += xmm1 * xmm2; \
|
||||
row3a += xmm0 * xmm3; \
|
||||
row3b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 4); \
|
||||
xmm3 = *(BO + 5); \
|
||||
row4a += xmm0 * xmm2; \
|
||||
row4b += xmm1 * xmm2; \
|
||||
row5a += xmm0 * xmm3; \
|
||||
row5b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 6); \
|
||||
xmm3 = *(BO + 7); \
|
||||
row6a += xmm0 * xmm2; \
|
||||
row6b += xmm1 * xmm2; \
|
||||
row7a += xmm0 * xmm3; \
|
||||
row7b += xmm1 * xmm3; \
|
||||
BO += 8; \
|
||||
AO += 2;
|
||||
|
||||
|
||||
#define SAVE2x8(ALPHA) \
|
||||
xmm0 = ALPHA; \
|
||||
row0a *= xmm0; \
|
||||
row0b *= xmm0; \
|
||||
row1a *= xmm0; \
|
||||
row1b *= xmm0; \
|
||||
row2a *= xmm0; \
|
||||
row2b *= xmm0; \
|
||||
row3a *= xmm0; \
|
||||
row3b *= xmm0; \
|
||||
row4a *= xmm0; \
|
||||
row4b *= xmm0; \
|
||||
row5a *= xmm0; \
|
||||
row5b *= xmm0; \
|
||||
row6a *= xmm0; \
|
||||
row6b *= xmm0; \
|
||||
row7a *= xmm0; \
|
||||
row7b *= xmm0; \
|
||||
*(CO1 + 0 * ldc + 0) += row0a; \
|
||||
*(CO1 + 0 * ldc + 1) += row0b; \
|
||||
*(CO1 + 1 * ldc + 0) += row1a; \
|
||||
*(CO1 + 1 * ldc + 1) += row1b; \
|
||||
*(CO1 + 2 * ldc + 0) += row2a; \
|
||||
*(CO1 + 2 * ldc + 1) += row2b; \
|
||||
*(CO1 + 3 * ldc + 0) += row3a; \
|
||||
*(CO1 + 3 * ldc + 1) += row3b; \
|
||||
*(CO1 + 4 * ldc + 0) += row4a; \
|
||||
*(CO1 + 4 * ldc + 1) += row4b; \
|
||||
*(CO1 + 5 * ldc + 0) += row5a; \
|
||||
*(CO1 + 5 * ldc + 1) += row5b; \
|
||||
*(CO1 + 6 * ldc + 0) += row6a; \
|
||||
*(CO1 + 6 * ldc + 1) += row6b; \
|
||||
*(CO1 + 7 * ldc + 0) += row7a; \
|
||||
*(CO1 + 7 * ldc + 1) += row7b; \
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT1x8() \
|
||||
row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0;
|
||||
|
||||
#define KERNEL1x8_SUB() \
|
||||
xmm0 = *(AO ); \
|
||||
xmm2 = *(BO + 0); \
|
||||
xmm3 = *(BO + 1); \
|
||||
row0 += xmm0 * xmm2; \
|
||||
row1 += xmm0 * xmm3; \
|
||||
xmm2 = *(BO + 2); \
|
||||
xmm3 = *(BO + 3); \
|
||||
row2 += xmm0 * xmm2; \
|
||||
row3 += xmm0 * xmm3; \
|
||||
xmm2 = *(BO + 4); \
|
||||
xmm3 = *(BO + 5); \
|
||||
row4 += xmm0 * xmm2; \
|
||||
row5 += xmm0 * xmm3; \
|
||||
xmm2 = *(BO + 6); \
|
||||
xmm3 = *(BO + 7); \
|
||||
row6 += xmm0 * xmm2; \
|
||||
row7 += xmm0 * xmm3; \
|
||||
BO += 8; \
|
||||
AO += 1;
|
||||
|
||||
|
||||
#define SAVE1x8(ALPHA) \
|
||||
xmm0 = ALPHA; \
|
||||
row0 *= xmm0; \
|
||||
row1 *= xmm0; \
|
||||
row2 *= xmm0; \
|
||||
row3 *= xmm0; \
|
||||
row4 *= xmm0; \
|
||||
row5 *= xmm0; \
|
||||
row6 *= xmm0; \
|
||||
row7 *= xmm0; \
|
||||
*(CO1 + 0 * ldc) += row0; \
|
||||
*(CO1 + 1 * ldc) += row1; \
|
||||
*(CO1 + 2 * ldc) += row2; \
|
||||
*(CO1 + 3 * ldc) += row3; \
|
||||
*(CO1 + 4 * ldc) += row4; \
|
||||
*(CO1 + 5 * ldc) += row5; \
|
||||
*(CO1 + 6 * ldc) += row6; \
|
||||
*(CO1 + 7 * ldc) += row7; \
|
||||
|
||||
|
||||
|
||||
|
@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
|
|||
return 0;
|
||||
|
||||
|
||||
|
||||
// L8_0
|
||||
while (N >= 8 && 0) {
|
||||
float *CO1;
|
||||
float *AO;
|
||||
int i;
|
||||
// L8_10
|
||||
CO1 = C;
|
||||
C += 8 * ldc;
|
||||
|
||||
AO = A;
|
||||
|
||||
i = m;
|
||||
|
||||
while (i >= 32 && 0) {
|
||||
float *BO, *AOb;
|
||||
// L8_11
|
||||
__m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
AOb = AO + 16 * K;
|
||||
|
||||
INIT32x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL32x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE32x8(alpha)
|
||||
CO1 += 32;
|
||||
AO += 16 * K;
|
||||
|
||||
i -= 32;
|
||||
}
|
||||
while (i >= 16) {
|
||||
float *BO;
|
||||
// L8_11
|
||||
__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT16x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL16x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE16x8(alpha)
|
||||
CO1 += 16;
|
||||
|
||||
i -= 16;
|
||||
}
|
||||
while (i >= 8) {
|
||||
float *BO;
|
||||
// L8_11
|
||||
__m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT8x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL8x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE8x8(alpha)
|
||||
CO1 += 8;
|
||||
|
||||
i -= 8;
|
||||
}
|
||||
while (i >= 4) {
|
||||
// L8_11
|
||||
float *BO;
|
||||
__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT4x8()
|
||||
// L8_16
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL4x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE4x8(alpha)
|
||||
CO1 += 4;
|
||||
|
||||
i -= 4;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Rest of M
|
||||
***************************************************************************/
|
||||
|
||||
while (i >= 2) {
|
||||
float *BO;
|
||||
float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
|
||||
BO = B;
|
||||
|
||||
INIT2x8()
|
||||
int kloop = K;
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL2x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE2x8(alpha)
|
||||
CO1 += 2;
|
||||
i -= 2;
|
||||
}
|
||||
// L13_40
|
||||
while (i >= 1) {
|
||||
float *BO;
|
||||
float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
int kloop = K;
|
||||
BO = B;
|
||||
INIT1x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL1x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE1x8(alpha)
|
||||
CO1 += 1;
|
||||
i -= 1;
|
||||
}
|
||||
|
||||
B += K * 8;
|
||||
N -= 8;
|
||||
}
|
||||
|
||||
while (N >= 4) {
|
||||
float *CO1;
|
||||
float *AO;
|
||||
|
|
Loading…
Reference in New Issue