Merge pull request #1919 from fenrus75/haswelltuning
(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
This commit is contained in:
commit
e8ca5a59a9
|
@ -5,6 +5,27 @@ endif
|
||||||
TOPDIR = ..
|
TOPDIR = ..
|
||||||
include $(TOPDIR)/Makefile.system
|
include $(TOPDIR)/Makefile.system
|
||||||
|
|
||||||
|
AVX2OPT =
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
|
# AVX2 support was added in 4.7.0
|
||||||
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||||
|
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
|
||||||
|
AVX2OPT = -mavx2
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
|
||||||
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
|
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
|
||||||
|
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
|
||||||
|
AVX2OPT = -mavx2
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifdef NO_AVX2
|
||||||
|
AVX2OPT=
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef TARGET_CORE
|
ifdef TARGET_CORE
|
||||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||||
|
@ -16,8 +37,10 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
else ifeq ($(TARGET_CORE), HASWELL)
|
||||||
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||||
else
|
else
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
endif
|
endif
|
||||||
BUILD_KERNEL = 1
|
BUILD_KERNEL = 1
|
||||||
KDIR =
|
KDIR =
|
||||||
|
|
|
@ -33,9 +33,10 @@ ZAXPYKERNEL = zaxpy.c
|
||||||
|
|
||||||
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
|
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||||
|
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
|
@ -61,30 +61,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
||||||
c_offset = c;
|
c_offset = c;
|
||||||
|
|
||||||
if (beta == ZERO){
|
if (beta == ZERO){
|
||||||
__m512 z_zero;
|
|
||||||
__m256 y_zero;
|
|
||||||
|
|
||||||
z_zero = _mm512_setzero_ps();
|
|
||||||
y_zero = _mm256_setzero_ps();
|
|
||||||
j = n;
|
j = n;
|
||||||
do {
|
do {
|
||||||
c_offset1 = c_offset;
|
c_offset1 = c_offset;
|
||||||
c_offset += ldc;
|
c_offset += ldc;
|
||||||
|
|
||||||
i = m;
|
i = m;
|
||||||
|
#ifdef __AVX2__
|
||||||
while (i >= 32) {
|
while (i >= 32) {
|
||||||
|
#ifdef __AVX512CD__
|
||||||
|
__m512 z_zero = _mm512_setzero_ps();
|
||||||
_mm512_storeu_ps(c_offset1, z_zero);
|
_mm512_storeu_ps(c_offset1, z_zero);
|
||||||
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
||||||
|
#else
|
||||||
|
__m256 y_zero = _mm256_setzero_ps();
|
||||||
|
_mm256_storeu_ps(c_offset1, y_zero);
|
||||||
|
_mm256_storeu_ps(c_offset1 + 8, y_zero);
|
||||||
|
_mm256_storeu_ps(c_offset1 + 16, y_zero);
|
||||||
|
_mm256_storeu_ps(c_offset1 + 24, y_zero);
|
||||||
|
#endif
|
||||||
c_offset1 += 32;
|
c_offset1 += 32;
|
||||||
i -= 32;
|
i -= 32;
|
||||||
}
|
}
|
||||||
while (i >= 8) {
|
while (i >= 8) {
|
||||||
|
__m256 y_zero = _mm256_setzero_ps();
|
||||||
_mm256_storeu_ps(c_offset1, y_zero);
|
_mm256_storeu_ps(c_offset1, y_zero);
|
||||||
c_offset1 += 8;
|
c_offset1 += 8;
|
||||||
i -= 8;
|
i -= 8;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
while (i > 0) {
|
while (i > 0) {
|
||||||
*c_offset1 = ZERO;
|
*c_offset1 = ZERO;
|
||||||
c_offset1 ++;
|
c_offset1 ++;
|
||||||
|
|
|
@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
|
||||||
FLOAT *b_offset;
|
FLOAT *b_offset;
|
||||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||||
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
|
FLOAT ctemp9, ctemp13;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
|
||||||
|
|
||||||
a_offset = a;
|
a_offset = a;
|
||||||
b_offset = b;
|
b_offset = b;
|
||||||
|
|
Loading…
Reference in New Issue