Merge pull request #1919 from fenrus75/haswelltuning

(sgemm) Apply some of the SKYLAKEX optimizations also to HASWELL
This commit is contained in:
Martin Kroeker 2018-12-16 20:11:05 +01:00 committed by GitHub
commit e8ca5a59a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 10 deletions

View File

@ -5,6 +5,27 @@ endif
TOPDIR = ..
include $(TOPDIR)/Makefile.system
AVX2OPT =
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
AVX2OPT = -mavx2
endif
endif
ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
AVX2OPT = -mavx2
endif
endif
ifdef NO_AVX2
AVX2OPT=
endif
ifdef TARGET_CORE
ifeq ($(TARGET_CORE), SKYLAKEX)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
@ -16,8 +37,10 @@ ifeq ($(TARGET_CORE), SKYLAKEX)
override CFLAGS += -fno-asynchronous-unwind-tables
endif
endif
else ifeq ($(TARGET_CORE), HASWELL)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif
BUILD_KERNEL = 1
KDIR =

View File

@ -33,9 +33,10 @@ ZAXPYKERNEL = zaxpy.c
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMM_BETA = sgemm_beta_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)

View File

@ -61,30 +61,36 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset = c;
if (beta == ZERO){
__m512 z_zero;
__m256 y_zero;
z_zero = _mm512_setzero_ps();
y_zero = _mm256_setzero_ps();
j = n;
do {
c_offset1 = c_offset;
c_offset += ldc;
i = m;
#ifdef __AVX2__
while (i >= 32) {
#ifdef __AVX512CD__
__m512 z_zero = _mm512_setzero_ps();
_mm512_storeu_ps(c_offset1, z_zero);
_mm512_storeu_ps(c_offset1 + 16, z_zero);
#else
__m256 y_zero = _mm256_setzero_ps();
_mm256_storeu_ps(c_offset1, y_zero);
_mm256_storeu_ps(c_offset1 + 8, y_zero);
_mm256_storeu_ps(c_offset1 + 16, y_zero);
_mm256_storeu_ps(c_offset1 + 24, y_zero);
#endif
c_offset1 += 32;
i -= 32;
}
while (i >= 8) {
__m256 y_zero = _mm256_setzero_ps();
_mm256_storeu_ps(c_offset1, y_zero);
c_offset1 += 8;
i -= 8;
}
#endif
while (i > 0) {
*c_offset1 = ZERO;
c_offset1 ++;

View File

@ -49,8 +49,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
FLOAT *b_offset;
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp9, ctemp13;
a_offset = a;
b_offset = b;