Added a highly optimized dgemm_kernel for HASWELL
This commit is contained in:
parent
fe8c5666f9
commit
a77c71eaf5
|
@ -334,13 +334,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
|
||||||
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
||||||
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
|
|
||||||
else
|
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
|
|
@ -368,8 +368,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
|
|
||||||
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
||||||
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
|
|
||||||
else
|
|
||||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
|
|
|
@ -1,63 +1,43 @@
|
||||||
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
|
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||||
SGEMMINCOPY =
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY =
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
SGEMMINCOPYOBJ =
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ =
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
|
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
DGEMMKERNEL = dgemm_kernel_4x4_haswell.S
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
DGEMMINCOPY =
|
||||||
#DGEMMONCOPY = gemm_ncopy_4.S
|
DGEMMITCOPY =
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
#DGEMMOTCOPY = gemm_tcopy_4.S
|
DGEMMINCOPYOBJ =
|
||||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
DGEMMITCOPYOBJ =
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
|
|
||||||
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
|
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
|
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
|
||||||
ZGEMMINCOPY =
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
ZGEMMITCOPY =
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPYOBJ =
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMITCOPYOBJ =
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
|
|
||||||
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
|
||||||
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
|
||||||
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
|
|
||||||
|
|
||||||
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
|
|
||||||
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
|
|
||||||
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
|
|
||||||
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
|
|
||||||
|
|
||||||
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
|
|
||||||
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
|
|
||||||
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
|
|
||||||
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
|
|
||||||
|
|
||||||
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
|
|
||||||
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
|
|
||||||
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
|
|
||||||
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
@ -78,7 +58,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
|
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
11
param.h
11
param.h
|
@ -1182,14 +1182,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
@ -1221,17 +1221,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 768
|
#define SGEMM_DEFAULT_P 768
|
||||||
#define DGEMM_DEFAULT_P 192
|
#define DGEMM_DEFAULT_P 512
|
||||||
#define CGEMM_DEFAULT_P 384
|
#define CGEMM_DEFAULT_P 384
|
||||||
#define ZGEMM_DEFAULT_P 192
|
#define ZGEMM_DEFAULT_P 192
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 168
|
#define SGEMM_DEFAULT_Q 168
|
||||||
#define DGEMM_DEFAULT_Q 128
|
#define DGEMM_DEFAULT_Q 256
|
||||||
#define CGEMM_DEFAULT_Q 168
|
#define CGEMM_DEFAULT_Q 168
|
||||||
#define ZGEMM_DEFAULT_Q 168
|
#define ZGEMM_DEFAULT_Q 168
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_R sgemm_r
|
#define SGEMM_DEFAULT_R sgemm_r
|
||||||
#define DGEMM_DEFAULT_R dgemm_r
|
//#define DGEMM_DEFAULT_R dgemm_r
|
||||||
|
#define DGEMM_DEFAULT_R 13824
|
||||||
#define CGEMM_DEFAULT_R cgemm_r
|
#define CGEMM_DEFAULT_R cgemm_r
|
||||||
#define ZGEMM_DEFAULT_R zgemm_r
|
#define ZGEMM_DEFAULT_R zgemm_r
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue