Merge pull request #1919 from fenrus75/haswelltuning
(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
This commit is contained in:
commit
e8ca5a59a9
|
@ -5,6 +5,27 @@ endif
|
|||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||
AVX2OPT = -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
|
||||
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
|
||||
AVX2OPT = -mavx2
|
||||
endif
|
||||
endif
|
||||
ifdef NO_AVX2
|
||||
AVX2OPT=
|
||||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
|
@ -16,8 +37,10 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
|
|||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), HASWELL)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
BUILD_KERNEL = 1
|
||||
KDIR =
|
||||
|
|
|
@ -33,9 +33,10 @@ ZAXPYKERNEL = zaxpy.c
|
|||
|
||||
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -61,30 +61,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
c_offset = c;
|
||||
|
||||
if (beta == ZERO){
|
||||
__m512 z_zero;
|
||||
__m256 y_zero;
|
||||
|
||||
z_zero = _mm512_setzero_ps();
|
||||
y_zero = _mm256_setzero_ps();
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
|
||||
i = m;
|
||||
|
||||
#ifdef __AVX2__
|
||||
while (i >= 32) {
|
||||
#ifdef __AVX512CD__
|
||||
__m512 z_zero = _mm512_setzero_ps();
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
||||
#else
|
||||
__m256 y_zero = _mm256_setzero_ps();
|
||||
_mm256_storeu_ps(c_offset1, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 8, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 16, y_zero);
|
||||
_mm256_storeu_ps(c_offset1 + 24, y_zero);
|
||||
#endif
|
||||
c_offset1 += 32;
|
||||
i -= 32;
|
||||
}
|
||||
while (i >= 8) {
|
||||
__m256 y_zero = _mm256_setzero_ps();
|
||||
_mm256_storeu_ps(c_offset1, y_zero);
|
||||
c_offset1 += 8;
|
||||
i -= 8;
|
||||
}
|
||||
|
||||
#endif
|
||||
while (i > 0) {
|
||||
*c_offset1 = ZERO;
|
||||
c_offset1 ++;
|
||||
|
|
|
@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
|
|||
FLOAT *b_offset;
|
||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp9, ctemp13;
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
|
Loading…
Reference in New Issue