Use AVX512 also for DGEMM

This required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM.

Performance for the not-yet-retuned version is in the 30% range.
This commit is contained in:
Arjan van de Ven 2018-06-03 22:15:09 +00:00
parent ef626c6824
commit 89372e0993
3 changed files with 5154 additions and 2 deletions

View File

@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
DTRMMKERNEL = ../generic/trmmkernel_16x2.c
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c

File diff suppressed because it is too large Load Diff

View File

@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm2
vbroadcastss -3 * SIZE(BO), %zmm3
prefetcht0 A_PR1(AO)
# prefetcht0 A_PR1(AO)
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm2
vbroadcastss -3 * SIZE(BO), %zmm3
prefetcht0 A_PR1(AO)
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 )