added highly optimized dgemm_kernel for HASWELL

This commit is contained in:
wernsaar 2013-10-28 10:23:47 +01:00
parent fe8c5666f9
commit a77c71eaf5
5 changed files with 3514 additions and 58 deletions

View File

@ -334,13 +334,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) #if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

View File

@ -368,8 +368,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) #if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;

View File

@ -1,63 +1,43 @@
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMKERNEL = dgemm_kernel_4x4_haswell.S
DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPY =
#DGEMMONCOPY = gemm_ncopy_4.S DGEMMITCOPY =
DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ =
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ =
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMINCOPY = ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMITCOPY = ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@ -78,7 +58,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

File diff suppressed because it is too large Load Diff

11
param.h
View File

@ -1182,14 +1182,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 4
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2
@ -1221,17 +1221,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#define SGEMM_DEFAULT_P 768 #define SGEMM_DEFAULT_P 768
#define DGEMM_DEFAULT_P 192 #define DGEMM_DEFAULT_P 512
#define CGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 384
#define ZGEMM_DEFAULT_P 192 #define ZGEMM_DEFAULT_P 192
#define SGEMM_DEFAULT_Q 168 #define SGEMM_DEFAULT_Q 168
#define DGEMM_DEFAULT_Q 128 #define DGEMM_DEFAULT_Q 256
#define CGEMM_DEFAULT_Q 168 #define CGEMM_DEFAULT_Q 168
#define ZGEMM_DEFAULT_Q 168 #define ZGEMM_DEFAULT_Q 168
#define SGEMM_DEFAULT_R sgemm_r #define SGEMM_DEFAULT_R sgemm_r
#define DGEMM_DEFAULT_R dgemm_r //#define DGEMM_DEFAULT_R dgemm_r
#define DGEMM_DEFAULT_R 13824
#define CGEMM_DEFAULT_R cgemm_r #define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r #define ZGEMM_DEFAULT_R zgemm_r