From 961d25e9c7e4a1758adb1dbeaa15187de69dd052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 May 2018 22:54:39 +0200 Subject: [PATCH 01/86] Use the new zrot.c on POWER8 for crot as well fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly) --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a..1aa061078 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = zrot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 43e592ceb38a56716279a6514ceca1ec9bdb0865 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 24 May 2018 20:56:24 +0800 Subject: [PATCH 02/86] Add -lm for Android. Conflicts: exports/Makefile --- exports/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 53d4f75bb..127b05057 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -128,6 +128,8 @@ so : ../$(LIBSONAME) ifeq ($(OSNAME), Android) INTERNALNAME = $(LIBPREFIX).so +FEXTRALIB += -lm +EXTRALIB += -lm else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif From 908d40be715bfb252972a0a4abf27726a729945f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 29 May 2018 14:27:46 +0200 Subject: [PATCH 03/86] Adapt lapack-test and blas-test to changes in netlib directory layout partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..380ba1ce8 100644 --- a/Makefile +++ b/Makefile @@ -294,9 +294,10 @@ endif lapack-test : (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) endif @@ -308,9 +309,9 @@ lapack-runtest: blas-test: - (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing - (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) dummy : From a7dbd4c57d22b580b32f3a97b0b327bf2fedf551 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 11:19:33 +0200 Subject: [PATCH 04/86] Fix paths to LIN and EIG tests should fix 1574 --- lapack-netlib/lapack_testing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 3c917482d..5d07e1e87 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -257,16 +257,16 @@ for dtype in range_prec: else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if (not just_errors and not short_summary): print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing From 5fae96fb70cbc1205e50220f77722ac5ff92f0d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:43:45 +0200 Subject: [PATCH 05/86] Update version to 0.3.1.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5789119a..f49f20513 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 0.dev) +set(OpenBLAS_PATCH_VERSION 1.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From b491b10057196c5735a261608ec110b1bbd134d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:44:36 +0200 Subject: [PATCH 06/86] Update version to 0.3.1.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 12734464b..1b4b8eb63 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.0.dev +VERSION = 0.3.1.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From d1b7be14aa9b57ca4df9c00cdb4611974729b3be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:52:04 +0200 Subject: [PATCH 07/86] Handle INCX=0,INCY=0 case Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson. --- kernel/x86/swap.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index 54b00b33e..d3cf04942 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -138,6 +138,14 @@ /* INCX != 1 or INCY != 1 */ .L14: + cmpl $0, %ebx + jne .L141 + cmpl $0, %ecx + jne .L141 +/* INCX == 0 and INCY == 0 */ + jmp .L27 + +.L141 movl %edx, %eax sarl $2, %eax jle .L28 From a91f1587b9be6c9bbc403a79970d3e2a03bf866c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:26:00 +0200 Subject: [PATCH 08/86] Work around name clash with Windows10's winnt.h fixes #1503 --- driver/level3/Makefile | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 352225206..e320092e3 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) From 2fc748bf7200ca53d66d43107dc2c732685519d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:41:12 +0200 Subject: [PATCH 09/86] Restore optimized swap kernel now that we have a proper fix --- kernel/x86/KERNEL.NEHALEM | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM index 835520efb..65b03ae50 100644 --- a/kernel/x86/KERNEL.NEHALEM +++ b/kernel/x86/KERNEL.NEHALEM @@ -1,3 +1 @@ include $(KERNELDIR)/KERNEL.PENRYN -SSWAPKERNEL = ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c From 7df8c4f76fa7aadd8d1bce1d99fe826a4826d775 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 17:23:08 +0200 Subject: [PATCH 10/86] typo fix --- kernel/x86/swap.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index d3cf04942..e30c27898 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -145,7 +145,7 @@ /* INCX == 0 and INCY == 0 */ jmp .L27 -.L141 +.L141: movl %edx, %eax sarl $2, %eax jle .L28 From e2a8c35e5a6897e5aebf5e2fb8ba18f94735c89a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:08:14 +0200 Subject: [PATCH 11/86] Fixes from netlib PR253 LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf --- lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c index b4a7595d8..e4d538779 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_chetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c index d4f24142b..f6661c85c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_csytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_csy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c index cbf97b632..e72bfa6de 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_dsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c index d68cb17c1..182946a45 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_ssytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c index 5214217fb..dbad2d81e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zhetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c index 29d75319e..03726c63e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } From 677e42d7b0c6b6c40af94268fbb9d9be60f7af0a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:12:59 +0200 Subject: [PATCH 12/86] Fixes from netlib PR 253 When minimal workspace is given in ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage, ?sysv_aa_2stage, now no error is given Quick return for ?laqr1 --- lapack-netlib/SRC/cgejsv.f | 4 ++-- lapack-netlib/SRC/chesv_aa.f | 5 ++--- lapack-netlib/SRC/chesv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/chetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/chetrs_aa_2stage.f | 1 + lapack-netlib/SRC/cla_syamv.f | 2 +- lapack-netlib/SRC/claqr1.f | 7 +++++++ lapack-netlib/SRC/csysv_aa.f | 3 --- lapack-netlib/SRC/csysv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/csytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/csytri2.f | 6 +++--- lapack-netlib/SRC/csytrs_aa_2stage.f | 1 + lapack-netlib/SRC/ctrevc3.f | 18 +++++++++--------- lapack-netlib/SRC/dgelqt.f | 2 +- lapack-netlib/SRC/dla_syamv.f | 2 +- lapack-netlib/SRC/dlaqr1.f | 7 +++++++ lapack-netlib/SRC/dsysv_aa.f | 3 --- lapack-netlib/SRC/dsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/dsytrf_aa_2stage.f | 8 ++++++-- lapack-netlib/SRC/dsytri2.f | 6 +++--- lapack-netlib/SRC/dsytrs_aa_2stage.f | 1 + lapack-netlib/SRC/dtrevc3.f | 4 ++-- lapack-netlib/SRC/iparmq.f | 4 ++-- lapack-netlib/SRC/sla_syamv.f | 2 +- lapack-netlib/SRC/slaqr1.f | 7 +++++++ lapack-netlib/SRC/ssysv_aa.f | 3 --- lapack-netlib/SRC/ssysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/ssytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/ssytri2.f | 4 ++-- lapack-netlib/SRC/ssytrs_aa_2stage.f | 1 + lapack-netlib/SRC/strevc3.f | 12 ++++++------ lapack-netlib/SRC/zgejsv.f | 4 ++-- lapack-netlib/SRC/zhesv_aa.f | 5 ++--- lapack-netlib/SRC/zhesv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zhetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zhetrs_aa_2stage.f | 7 ++++--- lapack-netlib/SRC/zla_syamv.f | 2 +- lapack-netlib/SRC/zlaqr1.f | 7 +++++++ lapack-netlib/SRC/zsysv_aa.f | 3 --- lapack-netlib/SRC/zsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zsytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zsytri2.f | 2 +- lapack-netlib/SRC/zsytrs_aa_2stage.f | 1 + 43 files changed, 155 insertions(+), 101 deletions(-) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 8eb43cf50..a7b1c451c 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -701,7 +701,7 @@ LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -939,7 +939,7 @@ END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 0bf636b48..470f910bc 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -209,6 +209,8 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 057d9c57a..05f6b7bb7 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CHESV_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 0fa2ae3a0..ce34d73cc 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -658,6 +660,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f index 3f8576673..05d09275b 100644 --- a/lapack-netlib/SRC/chetrs_aa_2stage.f +++ b/lapack-netlib/SRC/chetrs_aa_2stage.f @@ -87,6 +87,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/cla_syamv.f b/lapack-netlib/SRC/cla_syamv.f index e1d3df960..695b5e478 100644 --- a/lapack-netlib/SRC/cla_syamv.f +++ b/lapack-netlib/SRC/cla_syamv.f @@ -241,7 +241,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'CLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f index b76bedf60..977947196 100644 --- a/lapack-netlib/SRC/claqr1.f +++ b/lapack-netlib/SRC/claqr1.f @@ -142,6 +142,13 @@ CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 9cd669d33..87be734cc 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index cba57fc3e..a13349824 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CSYSV_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 0a6bfbe31..0d0bd156c 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f index 4c6baaa3e..4bd8e4f99 100644 --- a/lapack-netlib/SRC/csytri2.f +++ b/lapack-netlib/SRC/csytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'CSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f index 03bccda82..d025c08fe 100644 --- a/lapack-netlib/SRC/csytrs_aa_2stage.f +++ b/lapack-netlib/SRC/csytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index c06b40477..a134c1a50 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -27,8 +27,8 @@ * .. * .. Array Arguments .. * LOGICAL SELECT( * ) -* REAL RWORK( * ) -* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL RWORK( * ) +* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -258,17 +258,17 @@ * .. * .. Array Arguments .. LOGICAL SELECT( * ) - REAL RWORK( * ) - COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL RWORK( * ) + COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) - COMPLEX CZERO, CONE + COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER NBMIN, NBMAX @@ -277,13 +277,13 @@ * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, LQUERY, OVER, RIGHTV, SOMEV INTEGER I, II, IS, J, K, KI, IV, MAXWRK, NB - REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL - COMPLEX CDUM + REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL + COMPLEX CDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ICAMAX - REAL SLAMCH, SCASUM + REAL SLAMCH, SCASUM EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM * .. * .. External Subroutines .. diff --git a/lapack-netlib/SRC/dgelqt.f b/lapack-netlib/SRC/dgelqt.f index 2124f3dc3..5b4ee65b5 100644 --- a/lapack-netlib/SRC/dgelqt.f +++ b/lapack-netlib/SRC/dgelqt.f @@ -158,7 +158,7 @@ INTEGER I, IB, IINFO, K * .. * .. External Subroutines .. - EXTERNAL DGEQRT2, DGELQT3, DGEQRT3, DLARFB, XERBLA + EXTERNAL DGELQT3, DLARFB, XERBLA * .. * .. Executable Statements .. * diff --git a/lapack-netlib/SRC/dla_syamv.f b/lapack-netlib/SRC/dla_syamv.f index 29566a6e9..bb6dbe288 100644 --- a/lapack-netlib/SRC/dla_syamv.f +++ b/lapack-netlib/SRC/dla_syamv.f @@ -230,7 +230,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'DLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f index 81a462fb3..795b072ab 100644 --- a/lapack-netlib/SRC/dlaqr1.f +++ b/lapack-netlib/SRC/dlaqr1.f @@ -147,6 +147,13 @@ INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index cbccd5e65..7192928c6 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index ac3c77d76..05e538f0b 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -107,6 +107,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -126,7 +127,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -152,6 +153,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -235,19 +237,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index f5f06cc1d..25fc1a2eb 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -109,6 +110,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -128,10 +130,10 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the -*> row and column IPIV(k). +*> row and column IPIV2(k). *> \endverbatim *> *> \param[out] INFO @@ -641,6 +643,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL DGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of DSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index 9aa21a854..23f8b9fa2 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f index caff5d4ad..bb283cb95 100644 --- a/lapack-netlib/SRC/dsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index 745f636d0..957baf4f0 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. *> diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f index e576e0db0..a9212b3e0 100644 --- a/lapack-netlib/SRC/iparmq.f +++ b/lapack-netlib/SRC/iparmq.f @@ -104,13 +104,13 @@ *> *> \param[in] NAME *> \verbatim -*> NAME is character string +*> NAME is CHARACTER string *> Name of the calling subroutine *> \endverbatim *> *> \param[in] OPTS *> \verbatim -*> OPTS is character string +*> OPTS is CHARACTER string *> This is a concatenation of the string arguments to *> TTQRE. *> \endverbatim diff --git a/lapack-netlib/SRC/sla_syamv.f b/lapack-netlib/SRC/sla_syamv.f index d40e7bd95..4459f4d8b 100644 --- a/lapack-netlib/SRC/sla_syamv.f +++ b/lapack-netlib/SRC/sla_syamv.f @@ -230,7 +230,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'SLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f index 7d7d851ee..2de33849d 100644 --- a/lapack-netlib/SRC/slaqr1.f +++ b/lapack-netlib/SRC/slaqr1.f @@ -147,6 +147,13 @@ INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index abf52b143..e470f5883 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -220,9 +220,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index a738c7415..43d937141 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -234,19 +236,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index a92974930..0e0f6edb7 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -641,6 +643,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL SGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of SSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 97b539005..4b9ea4e7b 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f index c9c7181f2..d271b9481 100644 --- a/lapack-netlib/SRC/ssytrs_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 0df1189f0..525978071 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * LOGICAL SELECT( * ) -* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. *> @@ -251,14 +251,14 @@ * .. * .. Array Arguments .. LOGICAL SELECT( * ) - REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) INTEGER NBMIN, NBMAX PARAMETER ( NBMIN = 8, NBMAX = 128 ) @@ -268,7 +268,7 @@ $ RIGHTV, SOMEV INTEGER I, IERR, II, IP, IS, J, J1, J2, JNXT, K, KI, $ IV, MAXWRK, NB, KI2 - REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, + REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, $ SMIN, SMLNUM, ULP, UNFL, VCRIT, VMAX, WI, WR, $ XNORM * .. diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index e8418c680..d553da90b 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -704,7 +704,7 @@ LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -942,7 +942,7 @@ END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index bbd0fdff4..8511f0e7d 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -209,6 +209,8 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index a34440029..ed221dc69 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -240,19 +242,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 4d62198d6..73c0ebe9a 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -657,6 +659,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f index 02e17476f..7fcee1118 100644 --- a/lapack-netlib/SRC/zhetrs_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f @@ -69,7 +69,7 @@ *> *> \param[in] A *> \verbatim -*> A is COMPLEX*16array, dimension (LDA,N) +*> A is COMPLEX*16 array, dimension (LDA,N) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> @@ -81,12 +81,13 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (LTB) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> @@ -106,7 +107,7 @@ *> *> \param[in,out] B *> \verbatim -*> B is COMPLEX*16array, dimension (LDB,NRHS) +*> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the right hand side matrix B. *> On exit, the solution matrix X. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_syamv.f b/lapack-netlib/SRC/zla_syamv.f index 02958bef3..cfdb3cdc8 100644 --- a/lapack-netlib/SRC/zla_syamv.f +++ b/lapack-netlib/SRC/zla_syamv.f @@ -241,7 +241,7 @@ INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'ZLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f index 03afb87aa..34341cb10 100644 --- a/lapack-netlib/SRC/zlaqr1.f +++ b/lapack-netlib/SRC/zlaqr1.f @@ -142,6 +142,13 @@ CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f index 10693c731..325d07c54 100644 --- a/lapack-netlib/SRC/zsysv_aa.f +++ b/lapack-netlib/SRC/zsysv_aa.f @@ -221,9 +221,6 @@ LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index fcf9bc870..029ed587d 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index 1f916726e..d3486c1a7 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ c $ (J+1)*NB+1, (J+1)*NB+KB, IPIV, 1 ) * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f index d5aabd43a..e7303c90b 100644 --- a/lapack-netlib/SRC/zsytri2.f +++ b/lapack-netlib/SRC/zsytri2.f @@ -163,7 +163,7 @@ UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'ZSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f index c5d894753..fa15eee90 100644 --- a/lapack-netlib/SRC/zsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> From c5b13d4e10d38eb1bad56aac21bc9ffcf0b577df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:14:45 +0200 Subject: [PATCH 13/86] Fixes from netlib PR 253 --- lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index 5698bcf94..f6d990d1c 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -218,7 +218,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DERRSY, DLACPY, DLARHS, - $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE + $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE, $ DSYTRS_AA_2STAGE, XLAENV * .. * .. Intrinsic Functions .. diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 0be321eb0..898422654 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -204,7 +204,7 @@ * .. External Subroutines .. EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, DERRVX, $ DGET04, DLACPY, DLARHS, DLATB4, DLATMS, - $ DSYSV_AA_2STAGE, CHET01_AA, DPOT02, + $ DSYSV_AA_2STAGE, DPOT02, $ DSYTRF_AA_2STAGE * .. * .. Scalars in Common .. diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index d8d9dc0a9..70e8ff6b8 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -203,7 +203,7 @@ * .. * .. External Subroutines .. EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, SERRVX, - $ CGET04, SLACPY, SLARHS, SLATB4, SLATMS, + $ SLACPY, SLARHS, SLATB4, SLATMS, $ SSYSV_AA_2STAGE, SSYT01_AA, SPOT02, $ SSYTRF_AA_2STAGE * .. diff --git a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f index d4d8c2939..87fc47f71 100644 --- a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f @@ -217,8 +217,8 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRSY, ZLACPY, ZLARHS, - $ CLATB4, ZLATMS, ZSYT02, ZSYT01, + EXTERNAL ALAERH, ALAHD, ALASUM, ZERRSY, ZLACPY, ZLARHS, + $ ZLATB4, ZLATMS, ZSYT02, ZSYT01, $ ZSYTRF_AA_2STAGE, ZSYTRS_AA_2STAGE, $ XLAENV * .. From a8002e283a5874946bb464a45045d4651081e675 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Fri, 1 Jun 2018 23:20:00 +0100 Subject: [PATCH 14/86] Revert "take out unused variables" This reverts commit e5752ff9b322c665a7393d6109c2da7ad6ee2523. The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)` branch. Closes gh-1586. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ef328b945..d69e52e97 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -180,7 +180,7 @@ int get_num_procs(void) { cpu_set_t *cpusetp; size_t size; int ret; -// int i,n; +int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) From 99c7bba8e404fcf697f00bc986e106892eff47ad Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 07:24:29 +0000 Subject: [PATCH 15/86] Initial support for SkylakeX / AVX512 This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server) target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set, which brings 2 basic things: 1) 512 bit wide SIMD (2x width of AVX2) 2) 32 SIMD registers (2x the number on AVX2) This initial patch only contains a trivial transofrmation of the Haswell SGEMM kernel to AVX512VL; more will follow later but this patch aims to get the infrastructure in place for this "later". Full performance tuning has not been done yet; with more registers and wider SIMD it's in theory possible to retune the kernels but even without that there's an interesting enough performance increase (30-40% range) with just this change. --- Makefile.system | 8 +- TargetList.txt | 1 + cmake/arch.cmake | 3 + cmake/system.cmake | 2 +- cpuid.h | 3 + cpuid_x86.c | 2 + driver/others/dynamic.c | 2 + driver/others/parameter.c | 4 +- getarch.c | 15 + kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 16 + kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SKYLAKEX | 4 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 6812 ++++++++++++++++++++ kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 119 + 57 files changed, 7034 insertions(+), 47 deletions(-) create mode 100644 kernel/x86_64/KERNEL.SKYLAKEX create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.S diff --git a/Makefile.system b/Makefile.system index 7bfac1fa8..b005b80c9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -62,6 +62,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -95,6 +98,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET_CORE), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -467,7 +473,7 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN +DYNAMIC_CORE += HASWELL ZEN SKYLAKEX endif endif diff --git a/TargetList.txt b/TargetList.txt index aeeaa9ede..31e4881c4 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -20,6 +20,7 @@ DUNNINGTON NEHALEM SANDYBRIDGE HASWELL +SKYLAKEX ATOM b)AMD CPU: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 798a9ef82..527d2bec6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -56,6 +56,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX2) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () + if (NOT NO_AVX512) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 645895671..c21fe7c14 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") diff --git a/cpuid.h b/cpuid.h index 1dacc49ba..a6bc211f3 100644 --- a/cpuid.h +++ b/cpuid.h @@ -115,6 +115,7 @@ #define CORE_STEAMROLLER 25 #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 +#define CORE_SKYLAKEX 28 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,6 +138,7 @@ #define HAVE_AVX (1 << 18) #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -211,5 +213,6 @@ typedef struct { #define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 +#define CPUTYPE_SKYLAKEX 52 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 342c56525..5f49e7715 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -50,6 +50,8 @@ #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CORE_HASWELL CORE_NEHALEM +#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM +#define CORE_SKYLAKEX CORE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index fbf7cd40e..a0c9794b1 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -74,6 +74,7 @@ extern gotoblas_t gotoblas_STEAMROLLER; extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; @@ -83,6 +84,7 @@ extern gotoblas_t gotoblas_ZEN; //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 31a48644f..e7332c0c4 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 992fc2b95..fcffe63e2 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "HASWELL" #endif +#ifdef FORCE_SKYLAKEX +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SKYLAKEX" +#define ARCHCONFIG "-DSKYLAKEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" +#define LIBNAME "skylakex" +#define CORENAME "SKYLAKEX" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c06d1eae8..947114ebe 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") + if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 066426396..b37e536ef 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), SKYLAKEX) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b6c5b54de..9030d7c6d 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -871,6 +871,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SKYLAKEX + +#ifdef DEBUG + fprintf(stderr, "SkylakeX\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 0b475afa2..34653d400 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index e98854f34..492f34344 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 086852cfc..6840c54ad 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 2dd8ad08b..361ccf603 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX)) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 154276f6a..11825429e 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index acdcd6e22..4c054f399 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index da561b583..e67496736 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index a11b0286a..498057697 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 787ab5982..f3072983d 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 9a3b0cbd7..879ae9c38 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index bd7a78b5a..6c308197b 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX new file mode 100644 index 000000000..744831d67 --- /dev/null +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -0,0 +1,4 @@ +include $(KERNELDIR)/KERNEL.HASWELL + +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index b1ec19bd3..586d05ac2 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 5f01f7eeb..93fca0a0d 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 770c955b2..d81766cd4 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index d75e58fdd..6bdea6787 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 9b9179da0..72af99809 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 4bde62824..b4acdccd2 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 8162a5d83..059549028 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 1b9ca7a60..309fbe767 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 6b99d6fdd..a7478e3a8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 428558617..2c7b3b17c 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3e8db3fa3..73099462c 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 61cb77a64..431e4bb3f 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89fe408a..d89c4070d 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index b6f3c21af..c3ab2ffe6 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S new file mode 100644 index 000000000..1fab892ca --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -0,0 +1,6812 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm2 + vbroadcastss -1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm2 + vbroadcastss 1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm12,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm3,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro KERNEL16x6_SUB4 + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm7 + vbroadcastss -1 * SIZE(BO), %zmm9 + VFMADD231PS_( %zmm8,%zmm7,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm9,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm11 + vbroadcastss 1 * SIZE(BO), %zmm13 + VFMADD231PS_( %zmm12,%zmm11,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm13,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm16 + vbroadcastss -3 * SIZE(BO), %zmm17 + + VFMADD231PS_( %zmm4,%zmm16,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm17,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm18 + vbroadcastss -1 * SIZE(BO), %zmm19 + VFMADD231PS_( %zmm8,%zmm18,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm19,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm20 + vbroadcastss 1 * SIZE(BO), %zmm21 + VFMADD231PS_( %zmm12,%zmm20,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm21,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm22 + vbroadcastss -3 * SIZE(BO), %zmm23 + + VFMADD231PS_( %zmm4,%zmm22,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm23,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm24 + vbroadcastss -1 * SIZE(BO), %zmm25 + VFMADD231PS_( %zmm8,%zmm24,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm25,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm26 + vbroadcastss 1 * SIZE(BO), %zmm27 + VFMADD231PS_( %zmm12,%zmm26,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm27,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm28 + vbroadcastss -3 * SIZE(BO), %zmm29 + + VFMADD231PS_( %zmm4,%zmm28,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm29,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm30 + vbroadcastss -1 * SIZE(BO), %zmm31 + VFMADD231PS_( %zmm8,%zmm30,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm31,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm1 + vbroadcastss 1 * SIZE(BO), %zmm5 + VFMADD231PS_( %zmm12,%zmm1,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm5,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + vmulps %zmm0 , %zmm12, %zmm12 + vmulps %zmm0 , %zmm14, %zmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO1, LDC,2), %zmm8,%zmm8 + + vaddps (CO2), %zmm10,%zmm10 + + vaddps (CO2, LDC), %zmm12,%zmm12 + + vaddps (CO2, LDC,2), %zmm14,%zmm14 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO1, LDC,2) + + vmovups %zmm10, (CO2) + + vmovups %zmm12, (CO2, LDC) + + vmovups %zmm14, (CO2, LDC,2) + +.endm + + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss %xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO2), %zmm8,%zmm8 + + vaddps (CO2, LDC), %zmm10,%zmm10 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO2) + + vmovups %zmm10, (CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + +#endif + + vmovups %zmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index fd028964b..65305ac59 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index f04d461f7..065e5b385 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 199d8a517..73ae001ea 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 691a071f7..f37c251a1 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8cae3fc1b..8a5c44c9b 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index d7091624d..0c40a3435 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 3549b9863..7a2eeace5 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 882b035a9..0408b577c 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 8cb1d532f..53866cf95 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index d11c76647..ef12569c8 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index f6f88155c..0fedc496b 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 3e4b7d5df..2ab7a671b 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index aa5d8fac0..2a6d0e4c7 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index dd95eea17..e44bd7550 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 75124cf3e..e9f330c36 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index db1a4ff5f..9f0dead18 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 599765a6d..b6106a37d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 4227d548e..49a5e85e8 100644 --- a/param.h +++ b/param.h @@ -1613,6 +1613,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#endif + +#ifdef SKYLAKEX + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#ifdef WINDOWS_ABI +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 128 +#else +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 +#endif +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif + + #endif From 00235157339dc5fba2b4194bd660c45257e539e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:22:59 +0200 Subject: [PATCH 16/86] Typo fix (misplaced parenthesis) --- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 361ccf603..e2f731fca 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX)) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif From f1fb9a474571846ffc140313dbe5b8ba21925b74 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:48:27 +0200 Subject: [PATCH 17/86] Propagate NO_AVX512 if needed --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index b005b80c9..cec4b44e5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -147,6 +147,10 @@ ifeq ($(NO_AVX2), 1) GETARCH_FLAGS += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +GETARCH_FLAGS += -DNO_AVX512 +endif + ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif From a7d0f49cec68dc3f116feed0320708ae004af4c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:13:25 +0200 Subject: [PATCH 18/86] Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available --- Makefile.system | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index cec4b44e5..82e38a6d2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -477,7 +477,12 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN SKYLAKEX +DYNAMIC_CORE += HASWELL ZEN +endif +ifneq ($(NO_AVX512), 1) +ifneq ($(NO_AVX2), 1) +DYNAMIC_CORE += SKYLAKEX +endif endif endif From 5a92b311e05fb938e1fd85dcaf6fbeebc77bd4fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:29:07 +0200 Subject: [PATCH 19/86] Separate Skylake X from Skylake --- cpuid_x86.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 5f49e7715..d0dbe1d24 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1301,6 +1301,19 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) @@ -1558,6 +1571,7 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + "SKYLAKEX" }; static char *lowercpuname[] = { @@ -1612,6 +1626,7 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", + "skylakex" }; static char *corename[] = { @@ -1643,6 +1658,7 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + "SKYLAKEX" }; static char *corename_lower[] = { @@ -1674,6 +1690,7 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", + "skylakex" }; @@ -1862,6 +1879,19 @@ int get_coretype(void){ else return CORE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if/support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) From 5a51cf4576df2e065e5517b04369ff10a2a83f58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:41:33 +0200 Subject: [PATCH 20/86] Separate Skylake X from Skylake --- driver/others/dynamic.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a0c9794b1..5e9a24b8b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -79,6 +79,11 @@ extern gotoblas_t gotoblas_EXCAVATOR; #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#endif #endif #else //Use NEHALEM kernels for sandy bridge @@ -286,8 +291,21 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + if (model == 5) { + // Intel Skylake X +#ifndef NO_AVX512 + return $gotoblas_SKYLAKEX; +#else + if(support_avx()) + return &gotoblas_HASWELL; + else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } +#endif //Intel Skylake - if (model == 14 || model == 5) { + if (model == 14) { if(support_avx()) return &gotoblas_HASWELL; else{ @@ -447,7 +465,8 @@ static char *corename[] = { "Haswell", "Steamroller", "Excavator", - "Zen" + "Zen", + "SkylakeX" }; char *gotoblas_corename(void) { @@ -475,7 +494,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; - + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; return corename[0]; } @@ -505,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); From 83fec56a3f55fa24b2e541549852bdee03d30a0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:01:11 +0200 Subject: [PATCH 21/86] Disable AVX512 (Skylake X) support if the build system is too old --- c_check | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/c_check b/c_check index a3b337602..dfe99350a 100644 --- a/c_check +++ b/c_check @@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); +$no_avx512= 0; +if (($architecture eq "x86") || ($architecture eq "x86_64")) { + $code = '"vaddps %zmm1, %zmm0, %zmm0"'; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + $args = " -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? != 0) { + $no_avx512 = 1; + } else { + $no_avx512 = 0; + } + unlink("tmpf.o"); +} + $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; From ef626c6824c26415bc074d11325245e72f9e3284 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:13:19 +0200 Subject: [PATCH 22/86] typo fix --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5e9a24b8b..2c902d108 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -294,7 +294,7 @@ static gotoblas_t *get_coretype(void){ if (model == 5) { // Intel Skylake X #ifndef NO_AVX512 - return $gotoblas_SKYLAKEX; + return &gotoblas_SKYLAKEX; #else if(support_avx()) return &gotoblas_HASWELL; From 89372e0993b7d9fe9061797625713519392fa42b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 22:15:09 +0000 Subject: [PATCH 23/86] Use AVX512 also for DGEMM this required switching to the generic gemm_beta code (which is faster anyway on SKX) for both DGEMM and SGEMM Performance for the not-retuned version is in the 30% range --- kernel/x86_64/KERNEL.SKYLAKEX | 15 + kernel/x86_64/dgemm_kernel_16x2_skylakex.S | 5138 ++++++++++++++++++++ kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 3 +- 3 files changed, 5154 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.S diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 744831d67..c273ff8cd 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,3 +2,18 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + +DTRMMKERNEL = ../generic/trmmkernel_16x2.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.S b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S new file mode 100644 index 000000000..91ac51280 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S @@ -0,0 +1,5138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 1024 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + vbroadcastsd -12 * SIZE(BO), %zmm1 + vbroadcastsd -11 * SIZE(BO), %zmm2 + vbroadcastsd -10 * SIZE(BO), %zmm3 + + vmovaps -16 * SIZE(AO), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + + vmovaps -8 * SIZE(AO), %zmm9 + VFMADD231PD_ %zmm10,%zmm1,%zmm9 + VFMADD231PD_ %zmm11,%zmm2,%zmm9 + VFMADD231PD_ %zmm12,%zmm3,%zmm9 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %zmm2 +.endm + + + + +.macro KERNEL16x3_2 + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro KERNEL16x3_3 + vmovups 0 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro KERNEL16x3_4 + vmovups 16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + addq $12, BI + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $64, %rax +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %zmm0 + + vmulpd %zmm0 , %zmm4 , %zmm4 + vmulpd %zmm0 , %zmm10, %zmm10 + + vmulpd %zmm0 , %zmm5 , %zmm5 + vmulpd %zmm0 , %zmm11, %zmm11 + + vmulpd %zmm0 , %zmm6 , %zmm6 + vmulpd %zmm0 , %zmm12, %zmm12 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %zmm4,%zmm4 + vaddpd 8 * SIZE(CO1), %zmm10,%zmm10 + + vaddpd (CO1, LDC), %zmm5,%zmm5 + vaddpd 8 * SIZE(CO1, LDC), %zmm11,%zmm11 + + vaddpd (CO1, LDC, 2), %zmm6,%zmm6 + vaddpd 8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12 + +#endif + + vmovups %zmm4 , (CO1) + vmovups %zmm10, 8 * SIZE(CO1) + + vmovups %zmm5 , (CO1, LDC) + vmovups %zmm11, 8 * SIZE(CO1, LDC) + + vmovups %zmm6 , (CO1, LDC, 2) + vmovups %zmm12, 8 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S index 1fab892ca..ac4421252 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) +# prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) @@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) From ac7b6e3e9aeffe111a0ef23ba74ac2b181b87e30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 08:23:40 +0200 Subject: [PATCH 24/86] Fix misplaced endif --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2c902d108..ac1186c8f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -302,8 +302,8 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } +#endif } -#endif //Intel Skylake if (model == 14) { if(support_avx()) From 8be027e4c62460f373980e883c487a30a15b5a5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 14:36:39 +0200 Subject: [PATCH 25/86] Update dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ac1186c8f..96612cc52 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -82,7 +82,7 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; #else -#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#define gotoblas_SKYLAKEX gotoblas_HASWELL #endif #endif #else From dc9fe05ab5845452d684746bb7b7b7ad400c0c31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 17:10:19 +0200 Subject: [PATCH 26/86] Update cpuid_x86.c --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index d0dbe1d24..fc937865c 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1883,7 +1883,7 @@ int get_coretype(void){ #ifndef NO_AVX512 return CORE_SKYLAKEX; #else - if/support_avx()) + if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else From b7feded85acaf95d68ed4cfd573e60c83fdbca5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:24:05 +0200 Subject: [PATCH 27/86] Propagate NO_AVX512 via CCOMMON_OPT --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 82e38a6d2..8c875d6f7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -939,6 +939,10 @@ ifeq ($(NO_AVX2), 1) CCOMMON_OPT += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +CCOMMON_OPT += -DNO_AVX512 +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER From 38ad05bd0484ea723a42415f986cf0db24e01ca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:26:49 +0200 Subject: [PATCH 28/86] Extend loop range to find SkylakeX in force_coretype --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 96612cc52..acb2d8b8c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -506,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 23; i++) + for ( i=1 ; i <= 24; i++) { if (!strncasecmp(coretype,corename[i],20)) { From 354a976a59f1280c5403b8de37587baf53527b67 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:31:34 +0200 Subject: [PATCH 29/86] Fix inverted condition in _Atomic declaration fixes #1593 --- common.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common.h b/common.h index 123e3dee7..980099ee3 100644 --- a/common.h +++ b/common.h @@ -642,6 +642,7 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); #ifdef USE_OPENMP + #ifndef C_MSVC int omp_in_parallel(void); int omp_get_num_procs(void); @@ -649,12 +650,15 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif + #if (__STDC_VERSION__ >= 201112L) +#include +#else #ifndef _Atomic #define _Atomic volatile #endif -#include #endif + #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); From 15a78d6b662569a464de9a00517897b036fe7886 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 15:58:34 +0200 Subject: [PATCH 30/86] export NO_AVX512 setting --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 8c875d6f7..eaf3e9889 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1249,6 +1249,7 @@ export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE +export NO_AVX512 export SGEMM_UNROLL_M export SGEMM_UNROLL_N From e8002536ec90b74148abce1c3de9bca0061dbe32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 18:23:01 +0200 Subject: [PATCH 31/86] disable quiet_make for the moment --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index eaf3e9889..5c16e2bee 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -ifeq ($(QUIET_MAKE), 1) -MAKE += -s -endif +#ifeq ($(QUIET_MAKE), 1) +#MAKE += -s +#endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 From f6021c798dea23685af3eedcb63c4a388c78f226 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 19:09:38 +0200 Subject: [PATCH 32/86] Re-enable QUIET_MAKE --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 5c16e2bee..eaf3e9889 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -#ifeq ($(QUIET_MAKE), 1) -#MAKE += -s -#endif +ifeq ($(QUIET_MAKE), 1) +MAKE += -s +endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 From 7fb62aed7e2a08fb8fc62054a164d3479511ce82 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 23:29:33 +0200 Subject: [PATCH 33/86] Check build system support for AVX512 instructions --- cmake/system_check.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d47c38cdd..f054852bf 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -66,3 +66,12 @@ else() set(BINARY32 1) endif() +if (X86_64 OR X86) + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }") +execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) +if (NO_AVX512 EQUAL 1) +set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif() + file(REMOVE "avx512.tmp" "avx512.o") +endif() + From 06d43760e4ca2cc7007e54d88938eff9e95e0579 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 09:18:10 +0200 Subject: [PATCH 34/86] Restore _Atomic define before stdatomic.h for old gcc see #1593 --- common.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/common.h b/common.h index 123e3dee7..ecf07316d 100644 --- a/common.h +++ b/common.h @@ -649,12 +649,21 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif + #if (__STDC_VERSION__ >= 201112L) +#if defined(C_GCC) && ( __GNUC__ < 7) +// workaround for GCC bug 65467 #ifndef _Atomic #define _Atomic volatile #endif -#include #endif +#include +#else +#ifndef _Atomic +#define _Atomic volatile +#endif + + #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); From 83da278093e32f1e089a12d880c7ec65dfbb1457 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 09:27:49 +0200 Subject: [PATCH 35/86] Update common.h --- common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common.h b/common.h index cd1c4c0d1..663f37e7b 100644 --- a/common.h +++ b/common.h @@ -663,6 +663,7 @@ __declspec(dllimport) int __cdecl omp_get_num_procs(void); #ifndef _Atomic #define _Atomic volatile #endif +#endif #else #ifdef __ELF__ From 9b87b642624b398ebacee525edbc879cf3f950ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 16:49:00 +0200 Subject: [PATCH 36/86] Improve AVX512 testcase clang 3.4 managed to accept the original test code, only to fail on the actual Skylake asm later --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index dfe99350a..cc64c16c6 100644 --- a/c_check +++ b/c_check @@ -203,8 +203,8 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { - $code = '"vaddps %zmm1, %zmm0, %zmm0"'; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; + print $tmpf "int main(void){ __asm__ volatile($code); }\n"; $args = " -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args"); system(@cmd) == 0; From e4718b1fee0f8dcd0c892063d619477bd5ed31ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 16:51:30 +0200 Subject: [PATCH 37/86] Better AVX512 test case --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index f054852bf..a565fc0d5 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -67,7 +67,7 @@ else() endif() if (X86_64 OR X86) - file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }") + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") From ed7c4a043b3093dfe8ddb3d6d3e3d6fd6af43d4a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Jun 2018 10:18:26 +0200 Subject: [PATCH 38/86] Use usleep instead of sched_yield by default sched_yield only burns cpu cycles, fixes #900, see also #923, #1560 --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 663f37e7b..b7181e670 100644 --- a/common.h +++ b/common.h @@ -356,7 +356,7 @@ typedef int blasint; */ #ifndef YIELDING -#define YIELDING sched_yield() +#define YIELDING usleep(10) #endif /*** From e8880c1699816483090aa5574cf9b3322943831f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Jun 2018 10:26:55 +0200 Subject: [PATCH 39/86] Use a single thread for small input size copies daxpy improvement from #27, see #1560 --- interface/zaxpy.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/interface/zaxpy.c b/interface/zaxpy.c index fbb830ffb..529e78e79 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -41,7 +41,11 @@ #ifdef FUNCTION_PROFILE #include "functable.h" #endif - +#if defined(Z13) +#define MULTI_THREAD_MINIMAL 200000 +#else +#define MULTI_THREAD_MINIMAL 10000 +#endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #endif #ifndef CBLAS - PRINT_DEBUG_CNAME; + PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif @@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incx == 0 || incy == 0) nthreads = 1; + //Work around the low performance issue with small imput size & + //multithreads. + if (n <= MULTI_THREAD_MINIMAL) { + nthreads = 1; + } if (nthreads == 1) { #endif From 66316b9f4c8c7c48eed8b29e86f64581c02d45b0 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 7 Jun 2018 14:54:42 +0100 Subject: [PATCH 40/86] Improve performance of GEMM for small matrices when SMP is defined. Always checking num_cpu_avail() regardless of whether threading will actually be used adds noticeable overhead for small matrices. Most other uses of num_cpu_avail() do so only if threading will be used, so do the same here. --- interface/gemm.c | 27 ++++++--------------------- interface/trsm.c | 3 ++- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 8baf3fbec..a3bac5984 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -44,6 +44,7 @@ #endif #ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) @@ -52,6 +53,7 @@ #define ERROR_NAME "SGEMM " #endif #else +#define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " @@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); - nthreads_max = num_cpu_avail(3); - nthreads_avail = nthreads_max; - -#ifndef COMPLEX MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#else - MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#endif - args.common = NULL; - - if ( nthreads_max > nthreads_avail ) - args.nthreads = nthreads_avail; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) + args.nthreads = 1; else - args.nthreads = nthreads_max; - + args.nthreads = num_cpu_avail(3); + args.common = NULL; if (args.nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 60c49795d..5c2750e79 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - args.nthreads = num_cpu_avail(3); if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; + else + args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { From 6c2d90ba7724b05e7fb97c7ec33324499e4a1a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:29:17 +0200 Subject: [PATCH 41/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- CMakeLists.txt | 1 + Makefile | 3 +++ Makefile.install | 2 +- Makefile.rule | 5 +++++ Makefile.system | 17 ++++++++++++++++- 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f20513..66c3d8afa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) +option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) diff --git a/Makefile b/Makefile index 380ba1ce8..56b4426f8 100644 --- a/Makefile +++ b/Makefile @@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1) do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +ifeq ($(DYNAMIC_OLDER), 1) + @echo DYNAMIC_OLDER=1 >> Makefile.conf_last +endif endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last diff --git a/Makefile.install b/Makefile.install index 21c3c9e22..c51c8a021 100644 --- a/Makefile.install +++ b/Makefile.install @@ -98,7 +98,7 @@ endif @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" diff --git a/Makefile.rule b/Makefile.rule index 1b4b8eb63..5c03d0195 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -17,6 +17,11 @@ VERSION = 0.3.1.dev # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 +# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH +# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON, +# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures) +# DYNAMIC_OLDER = 1 + # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # CC = gcc diff --git a/Makefile.system b/Makefile.system index eaf3e9889..62ba0e466 100644 --- a/Makefile.system +++ b/Makefile.system @@ -472,7 +472,18 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += PENRYN DUNNINGTON +endif +DYNAMIC_CORE += NEHALEM +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += OPTERON OPTERON_SSE3 +endif +DYNAMIC_CORE += BARCELONA +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += BOBCAT ATOM NANO +endif ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif @@ -917,6 +928,10 @@ ifeq ($(DYNAMIC_ARCH), 1) CCOMMON_OPT += -DDYNAMIC_ARCH endif +ifeq ($(DYNAMIC_OLDER), 1) +CCOMMON_OPT += -DDYNAMIC_OLDER +endif + ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK #Disable LAPACK C interface From 1cbd8f3ae47ffb89523fa247e81ffea07c6505a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:30:46 +0200 Subject: [PATCH 42/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- cmake/arch.cmake | 13 ++++++++++++- cmake/openblas.pc.in | 2 +- cmake/system.cmake | 3 +++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 527d2bec6..52fb64eaa 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,18 @@ if (DYNAMIC_ARCH) endif () if (X86_64) - set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) + set(DYNAMIC_CORE PRESCOTT CORE2) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO) + endif () if (NOT NO_AVX) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) endif () diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 35973b09b..ca88a6d5f 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,7 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ diff --git a/cmake/system.cmake b/cmake/system.cmake index c21fe7c14..48e8f75bc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -163,6 +163,9 @@ endif () if (DYNAMIC_ARCH) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () endif () if (NO_LAPACK) From 63f7395fb49091295463785f6c1056f61dd64a7d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:31:38 +0200 Subject: [PATCH 43/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- driver/others/dynamic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index acb2d8b8c..4271c0a0d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -56,16 +56,27 @@ EXTERN gotoblas_t gotoblas_BANIAS; EXTERN gotoblas_t gotoblas_ATHLON; extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_CORE2; extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; From e9cd11768c20707eff31912db1bafc837c0224d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 17:54:36 +0200 Subject: [PATCH 44/86] Enable parallel make on MS Windows by default fixes #874 --- getarch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/getarch.c b/getarch.c index fcffe63e2..31f41d62c 100644 --- a/getarch.c +++ b/getarch.c @@ -1196,9 +1196,7 @@ int main(int argc, char *argv[]){ #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else -#ifndef OS_WINDOWS printf("MAKE += -j %d\n", get_num_cores()); -#endif #endif break; From 0bea6bb9e7e2468bc9d42f5ffdf27f772f2984af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:24:37 +0200 Subject: [PATCH 45/86] Create OpenBLASConfig.cmake from cmake as well --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f20513..e1c308910 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) +option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) @@ -208,6 +209,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} + EXPORT "OpenBLASTargets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) @@ -267,3 +269,21 @@ if(PKG_CONFIG_FOUND) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() + + +# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". +set(PN OpenBLAS) +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +configure_package_config_file(cmake/${PN}Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + VERSION ${${PN}_VERSION} + COMPATIBILITY AnyNewerVersion) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}Targets" + NAMESPACE "${PN}::" + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + From 02634b549b678dc38c85ce4c77ebb532e8d9e471 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:25:46 +0200 Subject: [PATCH 46/86] Add template for OpenBLASConfig.cmake --- cmake/OpenBLASConfig.cmake.in | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 cmake/OpenBLASConfig.cmake.in diff --git a/cmake/OpenBLASConfig.cmake.in b/cmake/OpenBLASConfig.cmake.in new file mode 100644 index 000000000..87a1621b4 --- /dev/null +++ b/cmake/OpenBLASConfig.cmake.in @@ -0,0 +1,79 @@ +# OpenBLASConfig.cmake +# -------------------- +# +# OpenBLAS cmake module. +# This module sets the following variables in your project:: +# +# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system +# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release +# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located. +# OpenBLAS_INCLUDE_DIR - same as DIRS +# OpenBLAS_LIBRARIES - OpenBLAS library to link against. +# OpenBLAS_LIBRARY - same as LIBRARIES +# +# +# Available components:: +# +## shared - search for only shared library +## static - search for only static library +# serial - search for unthreaded library +# pthread - search for native pthread threaded library +# openmp - search for OpenMP threaded library +# +# +# Exported targets:: +# +# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED` +## target. Target is shared _or_ static, so, for both, use separate, not +## overlapping, installations. :: +# +# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached. +# +# +# Suggested usage:: +# +# find_package(OpenBLAS) +# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread) +# +# +# The following variables can be set to guide the search for this package:: +# +# OpenBLAS_DIR - CMake variable, set to directory containing this Config file +# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package +# PATH - environment variable, set to bin directory of this package +# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables +# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build + +@PACKAGE_INIT@ + +set(PN OpenBLAS) + +# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon +if(@USE_OPENMP@) + set(${PN}_openmp_FOUND 1) +elseif(@USE_THREAD@) + set(${PN}_pthread_FOUND 1) +else() + set(${PN}_serial_FOUND 1) +endif() + +check_required_components(${PN}) + +#----------------------------------------------------------------------------- +# Don't include targets if this file is being picked up by another +# project which has already built this as a subproject +#----------------------------------------------------------------------------- +if(NOT TARGET ${PN}::OpenBLAS) + include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") + + get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) + set(${PN}_LIBRARY ${_loc}) + get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) + set(${PN}_LIBRARIES ${_ill}) + + get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIR ${_id}) + get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIRS ${_iid}) +endif() + From e65f451409e2150bf299a2cdd906bec4ffff7915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 15:09:43 +0200 Subject: [PATCH 47/86] include CMakePackageConfigHelpers --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e1c308910..a2421ac54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,9 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open # Adhere to GNU filesystem layout conventions include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + + set(OpenBLAS_LIBNAME openblas) ####### From c2545b0fd6978e1fb09c2dc86b825846e0034228 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 11 Jun 2018 10:13:09 +0100 Subject: [PATCH 48/86] Fixed a few more unnecessary calls to num_cpu_avail. I don't have as many benchmarks for these as for gemm, but it should still make a difference for small matrices. --- interface/axpy.c | 14 ++++++-------- interface/scal.c | 5 +++-- interface/zaxpy.c | 14 ++++++-------- interface/zscal.c | 4 ++-- interface/zswap.c | 4 ++-- kernel/arm64/casum_thunderx2t99.c | 9 +++------ kernel/arm64/copy_thunderx2t99.c | 9 +++------ kernel/arm64/dasum_thunderx2t99.c | 9 +++------ kernel/arm64/dot_thunderx2t99.c | 11 ++++------- kernel/arm64/dznrm2_thunderx2t99.c | 4 ++-- kernel/arm64/dznrm2_thunderx2t99_fast.c | 4 ++-- kernel/arm64/iamax_thunderx2t99.c | 9 +++------ kernel/arm64/izamax_thunderx2t99.c | 9 +++------ kernel/arm64/sasum_thunderx2t99.c | 9 +++------ kernel/arm64/scnrm2_thunderx2t99.c | 4 ++-- kernel/arm64/zasum_thunderx2t99.c | 9 +++------ kernel/arm64/zdot_thunderx2t99.c | 9 +++------ kernel/x86_64/ddot.c | 15 ++++++--------- 18 files changed, 59 insertions(+), 92 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index f0d95b395..39edea6af 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -40,11 +40,11 @@ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" -#endif +#endif #if defined(Z13) #define MULTI_THREAD_MINIMAL 200000 #else -#define MULTI_THREAD_MINIMAL 10000 +#define MULTI_THREAD_MINIMAL 10000 #endif #ifndef CBLAS @@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (incy < 0) y -= (n - 1) * incy; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - + // //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/scal.c b/interface/scal.c index 3f468a2a3..6d07b1650 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #ifdef SMP - nthreads = num_cpu_avail(1); - if (n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 529e78e79..1a0259c96 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - - //Work around the low performance issue with small imput size & + // + //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) { + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; - } + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zscal.c b/interface/zscal.c index 633b6ecf5..bfaddc260 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ FUNCTION_PROFILE_START(); #ifdef SMP - nthreads = num_cpu_avail(1); - if ( n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index 5308cbe90..e33bbafba 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index cd5d936c5..c6dbb3f77 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = casum_compute(n, x, inc_x); diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index bd67b48b0..e31876139 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if (n <= 0) return 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { do_copy(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index ba12fc776..a212c9534 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = dasum_compute(n, x, inc_x); diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 8eeb94f36..3940acddd 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " faddp "DOTF", v0.2d \n" #endif /* !defined(DSDOT) */ -#else /* !defined(DOUBLE) */ +#else /* !defined(DOUBLE) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ @@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 2aea9b4a9..b94f0cffc 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_compute(n, x, inc_x, &ssq, &scale); diff --git a/kernel/arm64/dznrm2_thunderx2t99_fast.c b/kernel/arm64/dznrm2_thunderx2t99_fast.c index 8b04a3eb6..8405b388b 100644 --- a/kernel/arm64/dznrm2_thunderx2t99_fast.c +++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c @@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2 = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index a11b18419..e3bec4a20 100644 --- a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = iamax_compute(n, x, inc_x); diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 8d70b0515..b2e2828f0 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = izamax_compute(n, x, inc_x); diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 28fc34c62..014c667ba 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = sasum_compute(n, x, inc_x); diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index b8df4962b..f96de441e 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_double = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index 140e5a741..1d303a9a3 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = zasum_compute(n, x, inc_x); diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 70d683077..6185bc7d9 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA CIMAG(zdot) = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { zdot_compute(n, x, inc_x, y, inc_y, &zdot); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 059549028..0dc9cd3da 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" -#elif defined(NEHALEM) +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" @@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; - BLASLONG n1 = n & -4; + BLASLONG n1 = n & -4; while(i < n1) { @@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); From 6f71c0fce45c86c55d12b6e12e69b9ccb8ec2f28 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 13:26:19 +0200 Subject: [PATCH 49/86] =?UTF-8?q?Return=20a=20somewhat=20sane=20default=20?= =?UTF-8?q?value=20for=20L2=20cache=20size=20if=20cpuid=20retur=E2=80=A6?= =?UTF-8?q?=20(#1611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected Fixes #1610, the KVM hypervisor on Google Chromebooks returning zero for CPUID 0x80000006, causing DYNAMIC_ARCH builds of OpenBLAS to hang --- kernel/setparam-ref.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 9030d7c6d..f654de110 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -647,7 +647,9 @@ static int get_l2_size_old(void){ return 6144; } } - return 0; +// return 0; +fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); +return 256; } #endif @@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){ l2 = BITMASK(ecx, 16, 0xffff); #ifndef ARCH_X86 + if (l2 <= 0) { + fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); + return 256; + } return l2; #else From de8fff671d6081bf543b55c95655fe5f6b5e4007 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 17:05:27 +0200 Subject: [PATCH 50/86] Revert "Use usleep instead of sched_yield by default" --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index b7181e670..663f37e7b 100644 --- a/common.h +++ b/common.h @@ -356,7 +356,7 @@ typedef int blasint; */ #ifndef YIELDING -#define YIELDING usleep(10) +#define YIELDING sched_yield() #endif /*** From fcb77ab129821690fac4e532640c5cfa786c3a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jun 2018 16:57:58 +0200 Subject: [PATCH 51/86] Update OSX deployment target to 10.8 fixes #1580 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e466..5dffd8d2e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif From bf40f806efa55c7a7c7ec57535919598eaeb569d Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 14 Jun 2018 12:18:04 +0100 Subject: [PATCH 52/86] Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something. --- driver/others/memory.c | 199 +++++++++-------------------------------- 1 file changed, 43 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97..85f790615 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -415,8 +423,15 @@ struct release_t { int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define UNLIKELY_TO_BE_ZERO(x) (x) +#else +#define THREAD_LOCAL __thread +#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#endif +static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; +static int THREAD_LOCAL release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } return map_address; @@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; +struct memory_t { void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif +}; -} memory[NUM_BUFFERS]; +static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; static int memory_initialized = 0; @@ -987,9 +987,6 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif + if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - LOCK_COMMAND(&alloc_lock); + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); - if (!memory_initialized) { - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif + if (!memory_initialized) { #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - memory_initialized = 1; + memory_initialized = 1; + } + UNLOCK_COMMAND(&alloc_lock); } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif if (!memory[position].addr) { do { @@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); @@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } @@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; @@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif - // arm: ensure all writes are finished before other thread takes this memory - WMB; - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){ printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < BUFFERS_PER_THREAD; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); #endif return; } @@ -1293,8 +1188,6 @@ void blas_shutdown(void){ BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } @@ -1305,17 +1198,11 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; } - UNLOCK_COMMAND(&alloc_lock); - return; } From 47bf0dba8f7a9cbd559e2f9cabe0bf2c7d3ee7a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Jun 2018 11:25:05 +0200 Subject: [PATCH 53/86] Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620) * Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD * Amended description of GEMM_MULTITHREAD_THRESHOLD to reflect #742 making it track floating point operations rather than matrix size --- Makefile.rule | 15 +++++++++++++-- driver/others/blas_server_omp.c | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 5c03d0195..649aabe70 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. It'll be very diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb4320..4255852c8 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 9e162146a93a58a06515bc53f07e37b8924e0d67 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:32:03 +0000 Subject: [PATCH 54/86] Only initialize the part of the jobs array that will get used The jobs array is getting initialized in O(compiled cpus^2) complexity. Distros and people with bigger systems will use pretty high values (128 or 256 or more) for this value, leading to interesting bubbles in performance. Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 mult by 65x65). The hardware is capable of 32 multiplications per cycle theoretically. Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10703.9 10.6 0.0% 17990.6 6.3 0.0% 64 x 64 20778.4 12.8 0.0% 40629.2 6.5 0.0% 65 x 65 26869.9 10.3 0.0% 52545.7 5.3 0.0% 80 x 80 38104.5 13.5 0.0% 72492.7 7.1 0.0% 96 x 96 61626.4 14.4 0.0% 113983.8 7.8 0.0% 112 x 112 91803.8 15.3 0.0% 180987.3 7.8 0.0% 128 x 128 133161.4 15.8 0.0% 258374.3 8.1 0.0% When threading is turned on TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128 Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10725.9 10.5 -0.2% 18134.9 6.2 -0.8% 64 x 64 20500.6 12.9 1.3% 40929.1 6.5 -0.7% 65 x 65 2040832.1 0.1 -7495.2% 2097633.6 0.1 -3892.0% 80 x 80 2063129.1 0.2 -5314.4% 2119925.2 0.2 -2824.3% 96 x 96 2070374.5 0.4 -3259.6% 2173604.4 0.4 -1806.9% 112 x 112 2111721.5 0.7 -2169.6% 2263330.8 0.6 -1170.0% 128 x 128 2276181.5 0.9 -1609.3% 2377228.9 0.9 -820.1% There is a deep deep cliff once you hit 65x65 With this patch Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% The cliff is very significantly reduced. (more to follow) --- driver/level3/level3_thread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 4ab1ee8cc..018813b8c 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -658,8 +658,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Clear synchronization flags */ - for (i = 0; i < MAX_CPU_NUMBER; i++) { - for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < nthreads; i++) { + for (j = 0; j < nthreads; j++) { for (k = 0; k < DIVIDE_RATE; k++) { job[i].working[j][CACHE_LINE_SIZE * k] = 0; } From d148ec4ea18e672dacb1270d4a5308ccaaae18bc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:39:15 +0000 Subject: [PATCH 55/86] Don't use _Atomic for jobs sometimes... The use of _Atomic leads to really bad code generation in the compiler (on x86, you get 2 "mfence" memory barriers around each access with gcc8, despite x86 being ordered and cache coherent). But there's a fallback in the code that just uses volatile which is more than plenty in practice. If we're nervous about cross thread synchronization for these variables, we should make the YIELD function be a compiler/memory barrier instead. performance before (after last commit) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% Performance with this patch (roughly a 2x improvement): Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% --- driver/level3/level3_thread.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 018813b8c..7e75f69d1 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,11 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L -_Atomic -#else volatile -#endif BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; From 5c6f008365ee3c6d42f8630d27259f130a688468 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:47:50 +0000 Subject: [PATCH 56/86] Tune param.h for SkylakeX param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine grained the blocks for gemm need to be split up. Many platforms define this to 4. The reality is that the gemm low level implementation for SkylakeX likes bigger blocks due to the nature of SIMD... by tuning the SWITCH_RATIO to 32 the threading performance improves significantly: Before Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% After Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10666.3 10.6 0.4% 18236.9 6.2 -1.4% 64 x 64 20410.1 13.0 1.8% 39925.8 6.6 1.7% 65 x 65 34983.0 7.9 -30.2% 51494.6 5.4 2.0% 80 x 80 39769.1 13.0 -4.4% 63805.2 8.1 12.0% 96 x 96 45169.6 19.7 26.7% 80065.8 11.1 29.8% 112 x 112 57026.1 24.7 38.7% 99535.5 14.2 44.1% 128 x 128 64789.8 32.5 51.3% 117407.2 17.9 54.6% With this change, threading starts to be a win already at 96x96 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 49a5e85e8..3573fffbb 100644 --- a/param.h +++ b/param.h @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 6eb4b9ae7c7cc58af00ac21b52fed8810d7e5710 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:05:04 +0000 Subject: [PATCH 57/86] Tune HASWELL SWITCH_RATIO as well Similar to the SKYLAKEX patch, 32 seems to work best (much better than 4 or 16) Before (4) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15554.3 7.2 0.2% 30353.8 3.7 0.3% 64 x 64 30346.8 8.7 1.6% 63495.0 4.1 -0.1% 65 x 65 81668.1 3.4 -123.3% 82705.2 3.3 -21.2% 80 x 80 105045.9 4.9 -95.5% 115226.0 4.5 -2.2% 96 x 96 152461.2 5.8 -74.3% 148156.3 6.0 16.4% 112 x 112 188505.2 7.5 -42.2% 171187.3 8.2 36.4% 128 x 128 257884.0 8.1 -39.5% 224764.8 9.3 46.0% Intermediate (16) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15565.7 7.2 0.2% 30378.9 3.7 0.2% 64 x 64 30430.2 8.7 1.3% 63046.4 4.2 0.6% 65 x 65 27306.0 10.1 25.3% 38879.2 7.1 43.0% 80 x 80 51008.7 10.1 5.1% 61007.6 8.4 45.9% 96 x 96 70856.7 12.5 19.0% 83403.1 10.6 53.0% 112 x 112 84769.9 16.6 36.0% 99920.1 14.1 62.9% 128 x 128 84213.2 25.0 54.5% 113024.2 18.6 72.8% After (32) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15537.3 7.2 0.3% 30537.0 3.6 -0.3% 64 x 64 30352.7 8.7 1.6% 62597.8 4.2 1.3% 65 x 65 36857.0 7.5 -0.8% 56167.6 4.9 17.7% 80 x 80 42552.6 12.1 20.8% 69536.7 7.4 38.3% 96 x 96 52101.5 17.1 40.5% 91016.1 9.7 48.7% 112 x 112 63853.7 22.1 51.8% 110507.4 12.7 58.9% 128 x 128 73966.1 28.4 60.0% 163146.4 12.9 60.8% --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3573fffbb..cfa4bba5c 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 73de17664dfdf2934a2fdc6dd9442107e6c85035 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:50:43 +0000 Subject: [PATCH 58/86] Add missing barriers in gemm scheduler a few places in the gemm scheduler code were missing barriers; the code likely worked OK due to heavy use of volatile / _Atomic but there's no reason to get this incorrect --- driver/level3/level3_thread.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 7e75f69d1..aeb5e6ed4 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -347,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) @@ -409,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); /* Apply kernel with local region of A and part of other region of B */ @@ -427,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; } } } while (current != mypos); @@ -488,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; } } STOP_RPCC(waiting3); From 7e39ffe1135ee6ca1dc119f6eea9566668fd0916 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:53:15 +0000 Subject: [PATCH 59/86] On x86-64, make MB/WMB compiler barriers Whie on x86(64) one does not normally need full memory barriers, it's good practice to at least use compiler barriers for places where on other architectures memory barriers are used; this prevents the compiler from over-optimizing. --- common_x86_64.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7461aaf60..3236778b8 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,8 +60,13 @@ #endif */ +#ifdef __GNUC__ +#define MB __asm__ __volatile__("": : :"memory") +#define WMB __asm__ __volatile__("": : :"memory") +#else #define MB #define WMB +#endif static void __inline blas_lock(volatile BLASULONG *address){ From 2ddc96c9e5a86e3fd12954b3efc269f0cc8d07d8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 18:06:24 +0000 Subject: [PATCH 60/86] make WMB / MB safer on x86-64 make it so that if (foo) RMB; else MB; is always done correctly and without syntax surprises --- common_x86_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 3236778b8..62e138e34 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -61,11 +61,11 @@ */ #ifdef __GNUC__ -#define MB __asm__ __volatile__("": : :"memory") -#define WMB __asm__ __volatile__("": : :"memory") +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) #else -#define MB -#define WMB +#define MB do {} while (0) +#define WMB do {} while (0) #endif static void __inline blas_lock(volatile BLASULONG *address){ From 2d8cc7193ace18c28ea05ef39e13bb28437b6d89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Jun 2018 23:38:14 +0200 Subject: [PATCH 61/86] Support upcoming Intel Cannon Lake CPUs as Skylake X (#1621) * Support upcoming Cannon Lake as Skylake X --- cpuid_x86.c | 17 +++++++++++++++++ driver/others/dynamic.c | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index fc937865c..89eb809b0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1339,6 +1339,23 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 6: + switch (model) { + case 6: // Cannon Lake +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif + } + break; case 9: case 8: switch (model) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..bacd3b7fa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 1f9e4f319327dd53d1243edb3a812c5a2366a938 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:46:36 +0200 Subject: [PATCH 62/86] Handle special case of gfortran+clang+OpenMP --- ctest/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6eda43863..569a5dda3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -102,7 +102,13 @@ clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) From 6a5ab083b7e78458861b197b8e98b2506345d6d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:47:33 +0200 Subject: [PATCH 63/86] Handle special case of gfortran+clang+OpenMP --- test/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65fb6f438..074411b05 100644 --- a/test/Makefile +++ b/test/Makefile @@ -122,8 +122,13 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = - +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) From 10b70c904d9e3b610d35f1efe8d89888da4011bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:53:19 +0200 Subject: [PATCH 64/86] Handle erroneous user settings NOFORTRAN=0 and NO_FORTRAN --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 56b4426f8..728567f80 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NOFORTRAN), 0) +undefine NOFORTRAN +endif + +ifeq ($(NO_FORTRAN), 1) +undefine NO_FORTRAN +NOFORTRAN=1 +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From 9369d3e6e5207c6974af162e67d4060ed625c322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 23:28:06 +0200 Subject: [PATCH 65/86] Modify NOFORTRAN tests to always check the value; fix rewriting of NO_FORTRAN --- Makefile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 728567f80..4760be0be 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NOFORTRAN), 0) -undefine NOFORTRAN -endif - ifeq ($(NO_FORTRAN), 1) -undefine NO_FORTRAN -NOFORTRAN=1 +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) @@ -56,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -117,7 +119,7 @@ endif endif tests : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -219,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -240,7 +242,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -283,21 +285,21 @@ endif endif large.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From 952541e840bddbcdcdfce81aefc09edf7fbfb84f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 13:20:30 +0200 Subject: [PATCH 66/86] Need to use filter-out to handle NOFORTRAN not set --- Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 4760be0be..49dab6484 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,7 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) + $(info filter value of NOFORTRAN is:) + $(info x$(filter-out $(NOFORTRAN), 1 2)x) + +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc From 0c5b7b400b3973d214ce24c566be4446743eacf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 15:16:19 +0200 Subject: [PATCH 67/86] Add -march=skylake-avx512 to flags if target is skylake x --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1ba63278a..677c05d93 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,13 @@ endif endif endif +ifeq ($(CORE), SKYLAKEX) +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +FCOMMON_OPT += -march=skylake-avx512 +endif +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 05978528c3f3c61fb370e1fae0ac3013faaa595e Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Wed, 20 Jun 2018 17:03:18 +0100 Subject: [PATCH 68/86] Avoid declaring arrays of size 0 when making large stack allocations. --- common_stackalloc.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common_stackalloc.h b/common_stackalloc.h index 71fb1a477..ec0fa1611 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. */ -#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ - /* make it volatile because some function (ex: dgemv_n.S) */ \ - /* do not restore all register */ \ - volatile int stack_alloc_size = SIZE; \ - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ - stack_alloc_size = 0; \ - STACK_ALLOC_PROTECT_SET \ - TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + /* Avoid declaring an array of length 0 */ \ + TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ + __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. From a399d004257b2f43e8211341f924f3a73171b98c Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Wed, 20 Jun 2018 21:04:03 +0100 Subject: [PATCH 69/86] Further improvements to memory.c. (#1625) - Compiler TLS is now used only used when the compiler supports it - If compiler TLS is unsupported, we use platform-specific TLS - Only one variable (an index) is now in TLS - We only access TLS once per alloc, and never when freeing - Allocation / release info is now stored within the allocation itself, by over-allocating; this saves having external structures do the bookkeeping, and reduces some of the redundant data that was being stored (such as addresses) - We never hit the alloc lock when not using SMP or when using OpenMP (that was my fault) - Now that there are fewer tracking structures I think this is a bit easier to read than before --- driver/others/memory.c | 399 +++++++++++++++++++++++++---------------- 1 file changed, 243 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 85f790615..ed20cf5cd 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -326,6 +326,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } +static void blas_memory_init(); + void openblas_fork_handler() { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is @@ -337,7 +339,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); #endif @@ -415,23 +417,104 @@ int openblas_get_num_threads(void) { #endif } -struct release_t { - void *address; - void (*func)(struct release_t *); - long attr; -}; - int hugetlb_allocated = 0; #if defined(OS_WINDOWS) #define THREAD_LOCAL __declspec(thread) -#define UNLIKELY_TO_BE_ZERO(x) (x) +#define LIKELY_ONE(x) (x) #else #define THREAD_LOCAL __thread -#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#define LIKELY_ONE(x) (__builtin_expect(x, 1)) #endif -static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; -static int THREAD_LOCAL release_pos = 0; + +/* Stores information about the allocation and how to release it */ +struct alloc_t { + /* Whether this allocation is being used */ + int used; + /* Any special attributes needed when releasing this allocation */ + int attr; + /* Function that can properly release this memory */ + void (*release_func)(struct alloc_t *); + /* Pad to 64-byte alignment */ + char pad[64 - 2 * sizeof(int) - sizeof(void(*))]; +}; + +/* Convenience macros for storing release funcs */ +#define STORE_RELEASE_FUNC(address, func) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + } + +#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + alloc_info->attr = attr; \ + } + +/* The number of bytes that will be allocated for each buffer. When allocating + memory, we store an alloc_t followed by the actual buffer memory. This means + that each allocation always has its associated alloc_t, without the need + for an auxiliary tracking structure. */ +static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); + +/* Clang supports TLS from version 2.8 */ +#if defined(__clang__) && __clang_major__ > 2 || \ + (__clang_minor__ == 2 || __clang_minor__ == 8) +#define HAS_COMPILER_TLS +#endif + +/* GCC supports TLS from version 4.1 */ +#if !defined(__clang__) && defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define HAS_COMPILER_TLS +#endif + +/* MSVC supports TLS from version 2005 */ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#define HAS_COMPILER_TLS +#endif + +/* Versions of XCode before 8 did not properly support TLS */ +#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 +#undef HAS_COMPILER_TLS +#endif + +/* Android NDK's before version 12b did not support TLS */ +#if defined(__ANDROID__) && defined(__clang__) +#if __has_include() +#include +#endif +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef HAS_COMPILER_TLS +#endif +#endif + +/* Holds pointers to allocated memory */ +#if defined(SMP) && !defined(USE_OPENMP) +/* This is the number of threads than can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +static int next_memory_table_pos = 0; +# if defined(HAS_COMPILER_TLS) +/* Use compiler generated thread-local-storage */ +static int THREAD_LOCAL local_memory_table_pos = 0; +# else +/* Use system-dependent thread-local-storage */ +# if defined(OS_WINDOWS) +static DWORD local_storage_key; +# else +static pthread_key_t local_storage_key; +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#else +/* There is only one allocating thread when in single-threaded mode and when using OpenMP */ +# define MAX_ALLOCATING_THREADS 1 +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +/* Returns a pointer to the start of the per-thread memory allocation data */ +static __inline struct alloc_t ** get_memory_table() { +#if defined(SMP) && !defined(USE_OPENMP) +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); +# else + int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + if (!local_memory_table_pos) { + LOCK_COMMAND(&alloc_lock); + local_memory_table_pos = next_memory_table_pos++; + UNLOCK_COMMAND(&alloc_lock); + if (next_memory_table_pos > MAX_ALLOCATING_THREADS) + printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + } + return local_memory_table[local_memory_table_pos]; +#else + return local_memory_table[0]; +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +} + #ifdef ALLOC_MMAP -static void alloc_mmap_free(struct release_t *release){ +static void alloc_mmap_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : munmap failed\n"); } } @@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); } - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; @@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){ if (address){ /* Just give up use advanced operation */ - map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { - map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, allocation_block_size * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { @@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); @@ -567,7 +676,7 @@ static void *alloc_mmap(void *address){ } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif @@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){ allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + current = (SCALING - 1) * allocation_block_size; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; @@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){ best = (BLASULONG)-1; best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { current = run_bench(start, allocsize); @@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){ if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; @@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){ } #endif - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); return map_address; } @@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){ #ifdef ALLOC_MALLOC -static void alloc_malloc_free(struct release_t *release){ +static void alloc_malloc_free(struct alloc_t *alloc_info){ - free(release -> address); + free(alloc_info); } @@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){ void *map_address; - map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_malloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_malloc_free); return map_address; @@ -670,24 +771,20 @@ void *qfree (void *address); #define QCOMMS 0x2 #define QFAST 0x4 -static void alloc_qalloc_free(struct release_t *release){ +static void alloc_qalloc_free(struct alloc_t *alloc_info){ - qfree(release -> address); + qfree(alloc_info); } static void *alloc_qalloc(void *address){ void *map_address; - map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_qalloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_qalloc_free); return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } @@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){ #ifdef ALLOC_WINDOWS -static void alloc_windows_free(struct release_t *release){ +static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); } @@ -706,17 +803,13 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_windows_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_windows_free); return map_address; } @@ -728,13 +821,14 @@ static void *alloc_windows(void *address){ #define DEVICEDRIVER_NAME "/dev/mapper" #endif -static void alloc_devicedirver_free(struct release_t *release){ +static void alloc_devicedirver_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(address, allocation_block_size)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } @@ -751,17 +845,12 @@ static void *alloc_devicedirver(void *address){ } - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_devicedirver_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); return map_address; } @@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){ #ifdef ALLOC_SHM -static void alloc_shm_free(struct release_t *release){ +static void alloc_shm_free(struct alloc_t *alloc_info){ - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } @@ -781,22 +870,21 @@ static void *alloc_shm(void *address){ void *map_address; int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif shmctl(shmid, IPC_RMID, 0); - release_info[release_pos].address = map_address; - release_info[release_pos].attr = shmid; - release_info[release_pos].func = alloc_shm_free; - release_pos ++; + struct alloc_t *alloc_info = (struct alloc_t *)map_address; + alloc_info->release_func = alloc_shm_free; + alloc_info->attr = shmid; } return map_address; @@ -804,23 +892,23 @@ static void *alloc_shm(void *address){ #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS -static void alloc_hugetlb_free(struct release_t *release){ +static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #if defined(OS_LINUX) || defined(OS_AIX) - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ - munmap(release -> address, BUFFER_SIZE); + munmap(alloc_info, allocation_block_size); #endif #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif @@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, + shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX SHM_HUGETLB | #endif @@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){ map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ @@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){ mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); - map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size); #endif #ifdef OS_WINDOWS @@ -887,7 +975,7 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); @@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){ #endif - if (map_address != (void *)-1){ - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_hugetlb_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free); return map_address; } @@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){ static int hugetlb_pid = 0; -static void alloc_hugetlbfile_free(struct release_t *release){ +static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } @@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_hugetlbfile_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); return map_address; } @@ -964,19 +1044,11 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -struct memory_t { - void *addr; - int used; -#ifndef __64BIT__ - char dummy[48]; +#if __STDC_VERSION__ >= 201112L +static _Atomic int memory_initialized = 0; #else - char dummy[40]; +static volatile int memory_initialized = 0; #endif -}; - -static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; - -static int memory_initialized = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ @@ -984,6 +1056,20 @@ static int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_init(){ +#if defined(SMP) && !defined(USE_OPENMP) + next_memory_table_pos = 0; +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + local_storage_key = ::TlsAlloc(); +# else + pthread_key_create(&local_storage_key, NULL); +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#endif /* defined(SMP) && !defined(USE_OPENMP) */ + memset(local_memory_table, 0, sizeof(local_memory_table)); +} + void *blas_memory_alloc(int procpos){ int position; @@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + struct alloc_t * alloc_info; + struct alloc_t ** alloc_table; - if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - + if (!LIKELY_ONE(memory_initialized)) { +#if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { - +#endif + blas_memory_init(); #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif @@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){ memory_initialized = 1; +#if defined(SMP) && !defined(USE_OPENMP) } UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef DEBUG @@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){ #endif position = 0; - + alloc_table = get_memory_table(); do { - if (!memory[position].used) goto allocation; + if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; } while (position < BUFFERS_PER_THREAD); @@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){ printf(" Position -> %d\n", position); #endif - memory[position].used = 1; - - if (!memory[position].addr) { + alloc_info = alloc_table[position]; + if (!alloc_info) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); @@ -1082,7 +1172,7 @@ void *blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { @@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){ #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; - if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); - memory[position].addr = map_address; + alloc_table[position] = alloc_info = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); #endif } #ifdef DEBUG - printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); #endif - return (void *)memory[position].addr; + alloc_info->used = 1; + + return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); @@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){ return NULL; } -void blas_memory_free(void *free_area){ - +void blas_memory_free(void *buffer){ +#ifdef DEBUG int position; + struct alloc_t ** alloc_table; +#endif + /* Since we passed an offset pointer to the caller, get back to the actual allocation */ + struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t)); #ifdef DEBUG - printf("Unmapped Start : %p ...\n", free_area); + printf("Unmapped Start : %p ...\n", alloc_info); #endif - position = 0; - while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) - position++; - - if (memory[position].addr != free_area) goto error; - -#ifdef DEBUG - printf(" Position : %d\n", position); -#endif - - memory[position].used = 0; + alloc_info->used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){ return; - error: - printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); - #ifdef DEBUG - for (position = 0; position < BUFFERS_PER_THREAD; position++) - printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); + alloc_table = get_memory_table(); + for (position = 0; position < BUFFERS_PER_THREAD; position++){ + if (alloc_table[position]) { + printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); + } + } #endif return; } @@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) { void blas_shutdown(void){ - int pos; + int pos, thread; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (pos = 0; pos < release_pos; pos ++) { - release_info[pos].func(&release_info[pos]); + for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + alloc_info = (void *)0; + } + } } #ifdef SEEK_ADDRESS @@ -1198,11 +1290,6 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - memory[pos].addr = (void *)0; - memory[pos].used = 0; - } - return; } @@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size_t size; BLASULONG buffer; - size = BUFFER_SIZE - PAGESIZE; + size = allocation_block_size - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, UNLOCK_COMMAND(&init_lock); #endif - size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + size = MIN((allocation_block_size - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while (size > 0) { From 28c28ed275df2fd812bcdc75fdc04cdb6d9580b3 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 21 Jun 2018 11:13:57 +0100 Subject: [PATCH 70/86] Fix data races reported by TSAN. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed20cf5cd..7eff16ce3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -543,9 +543,9 @@ static __inline struct alloc_t ** get_memory_table() { if (!local_memory_table_pos) { LOCK_COMMAND(&alloc_lock); local_memory_table_pos = next_memory_table_pos++; - UNLOCK_COMMAND(&alloc_lock); if (next_memory_table_pos > MAX_ALLOCATING_THREADS) printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); + UNLOCK_COMMAND(&alloc_lock); # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); From 2aa0a5804e381f89a53fdbef9bd51e8af23c8940 Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Thu, 21 Jun 2018 17:47:45 +0100 Subject: [PATCH 71/86] Use BLAS rather than CBLAS in test_fork.c (#1626) This is handy for people not using lapack. --- utest/CMakeLists.txt | 2 -- utest/Makefile | 2 -- utest/test_fork.c | 22 +++++++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 77a42d84f..1b426afe7 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,7 +25,6 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src ) endif() endif() -endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e071540dc..e40b3c6db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,13 +17,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif -endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index 9e0244305..9fc51287c 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -48,11 +48,13 @@ void* xmalloc(size_t n) } } -void check_dgemm(double *a, double *b, double *result, double *expected, int n) +void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; int i; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, result, n); + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); for(i = 0; i < n * n; ++i) { ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) CTEST(fork, safety) { - int n = 1000; + blasint n = 1000; int i; double *a, *b, *c, *d; @@ -84,8 +86,10 @@ CTEST(fork, safety) // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, c, n); + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); fork_pid = fork(); if (fork_pid == -1) { From 9cf22b7d9129e186a1ee941fbab8e45328c50b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:27:30 +0200 Subject: [PATCH 72/86] Build cblas_iXamin interfaces --- interface/Makefile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 9b2b93b83..20ec74e9e 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) From eb71d61c7cb6640e66a5239d1113de8a8c1477df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:31:09 +0200 Subject: [PATCH 73/86] Expose CBLAS interface to BLAS extensions iXamin --- cblas.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cblas.h b/cblas.h index 89f78c133..6461f4209 100644 --- a/cblas.h +++ b/cblas.h @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 0b2b83d9ed91e5e9234e41b1d41b0a7f21f5234c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:41:32 +0200 Subject: [PATCH 74/86] Add support for a user-defined list of dynamic targets --- Makefile.system | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e466..4712d9525 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif @@ -497,6 +497,14 @@ endif endif endif +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +CCOMMON_OPT += $(XCCOMMON_OPT) +#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= From 1833a6707157abe966f39dcac90530c2461117d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:42:15 +0200 Subject: [PATCH 75/86] Add support for a user-defined list of dynamic targets --- driver/others/dynamic.c | 139 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d..d5ed6d164 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -49,6 +49,127 @@ #define EXTERN #endif +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; +#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN +extern gotoblas_t gotoblas_ZEN; +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_NORTHWOOD; @@ -108,6 +229,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define gotoblas_ZEN gotoblas_BARCELONA #endif +#endif // DYNAMIC_LIST #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -338,6 +460,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 01440685379f11f158c5f612cf15fc279eb16c88 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 25 Jun 2018 13:53:11 +0100 Subject: [PATCH 76/86] Rewrite &= -> = and simplify the initial blocking phase. --- driver/level3/level3_thread.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4..ee3e3b9a9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - /* Set flag so other threads can access local region of B */ - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + /* Set flag so other threads can access local region of B */ job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; + WMB; + } } /* Get regions of B from other threads and apply kernel */ @@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } From 750162a05f8c6d0d9530955f78e8e6bb138d8df9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Jun 2018 21:02:31 +0200 Subject: [PATCH 77/86] Try gradual fallback for cores not in the dynamic core list --- driver/others/dynamic.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index d5ed6d164..13794207c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -89,11 +89,15 @@ extern gotoblas_t gotoblas_NEHALEM; #endif #ifdef DYN_BARCELONA extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM #else #define gotoblas_BARCELONA gotoblas_PRESCOTT #endif #ifdef DYN_ATOM extern gotoblas_t gotoblas_ATOM; +elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM #else #define gotoblas_ATOM gotoblas_PRESCOTT #endif @@ -124,46 +128,82 @@ extern gotoblas_t gotoblas_OPTERON_SSE3; #endif #ifdef DYN_BOBCAT extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM #else #define gotoblas_BOBCAT gotoblas_PRESCOTT #endif #ifdef DYN_SANDYBRIDGE extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #else #define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT #endif #ifdef DYN_BULLDOZER extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM #else #define gotoblas_BULLDOZER gotoblas_PRESCOTT #endif #ifdef DYN_PILEDRIVER extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM #else #define gotoblas_PILEDRIVER gotoblas_PRESCOTT #endif #ifdef DYN_STEAMROLLER extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM #else #define gotoblas_STEAMROLLER gotoblas_PRESCOTT #endif #ifdef DYN_EXCAVATOR extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM #else #define gotoblas_EXCAVATOR gotoblas_PRESCOTT #endif #ifdef DYN_HASWELL extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM #else #define gotoblas_HASWELL gotoblas_PRESCOTT #endif #ifdef DYN_ZEN extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM #else #define gotoblas_ZEN gotoblas_PRESCOTT #endif #ifdef DYN_SKYLAKEX extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #else #define gotoblas_SKYLAKEX gotoblas_PRESCOTT #endif From 092175cfec7d49d40904aeff1d8121acb4ed1452 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 08:09:52 +0200 Subject: [PATCH 78/86] Revert changes to NOFORTRAN handling from 952541e --- Makefile | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 49dab6484..56b4426f8 100644 --- a/Makefile +++ b/Makefile @@ -21,17 +21,6 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NO_FORTRAN), 1) -define NOFORTRAN -1 -endef -define NO_LAPACK -1 -endef -export NOFORTRAN -export NO_LAPACK -endif - LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench @@ -58,7 +47,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +108,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +210,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,10 +231,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : - $(info filter value of NOFORTRAN is:) - $(info x$(filter-out $(NOFORTRAN), 1 2)x) - -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -288,21 +274,21 @@ endif endif large.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From e322a951febc933e0bae192dcb117e447df24050 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:44:13 +0200 Subject: [PATCH 79/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/cdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index e5a6e4d35..fd86a37b0 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cdot_kernel_L999 - cmp INC_X, #0 - beq cdot_kernel_L999 +# cmp INC_X, #0 +# beq cdot_kernel_L999 - cmp INC_Y, #0 - beq cdot_kernel_L999 +# cmp INC_Y, #0 +# beq cdot_kernel_L999 cmp INC_X, #1 bne cdot_kernel_S_BEGIN From 545b82efd30e4e0a33cb57bb7c6fb12601a6d3d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:00 +0200 Subject: [PATCH 80/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/ddot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index fb294d8b4..cc2e485b7 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble ddot_kernel_L999 - cmp INC_X, #0 - beq ddot_kernel_L999 +# cmp INC_X, #0 +# beq ddot_kernel_L999 - cmp INC_Y, #0 - beq ddot_kernel_L999 +# cmp INC_Y, #0 +# beq ddot_kernel_L999 cmp INC_X, #1 bne ddot_kernel_S_BEGIN From e344db269b5b45d08ff4ce60801de0ece0965866 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:57 +0200 Subject: [PATCH 81/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/sdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 5f4f424bf..544846258 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble sdot_kernel_L999 - cmp INC_X, #0 - beq sdot_kernel_L999 +# cmp INC_X, #0 +# beq sdot_kernel_L999 - cmp INC_Y, #0 - beq sdot_kernel_L999 +# cmp INC_Y, #0 +# beq sdot_kernel_L999 cmp INC_X, #1 bne sdot_kernel_S_BEGIN From b83e4c60c73e80269e84b46590005d622d05e6d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:46:42 +0200 Subject: [PATCH 82/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/zdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 43f2c0c0b..c0cd92d3c 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zdot_kernel_L999 - cmp INC_X, #0 - beq zdot_kernel_L999 +# cmp INC_X, #0 +# beq zdot_kernel_L999 - cmp INC_Y, #0 - beq zdot_kernel_L999 +# cmp INC_Y, #0 +# beq zdot_kernel_L999 cmp INC_X, #1 bne zdot_kernel_S_BEGIN From f0a8dc2eec86a20a1486034a999c36709e699266 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 11:34:48 +0200 Subject: [PATCH 83/86] Disable the AVX512 DGEMM kernel for now due to #1643 --- kernel/x86_64/KERNEL.SKYLAKEX | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c273ff8cd..2deb41b08 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -4,16 +4,16 @@ SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S DTRMMKERNEL = ../generic/trmmkernel_16x2.c -DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +#DGEMMINCOPY = ../generic/gemm_ncopy_16.c +#DGEMMITCOPY = ../generic/gemm_tcopy_16.c +#DGEMMONCOPY = ../generic/gemm_ncopy_2.c +#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file +DGEMM_BETA = ../generic/gemm_beta.c From 6e54b0a027437303e425382c7e5611c1e860632f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 17:31:06 +0200 Subject: [PATCH 84/86] Disable the 16x2 DTRMM kernel on SkylakeX as well --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 2deb41b08..1256f4c3c 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -3,7 +3,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S -DTRMMKERNEL = ../generic/trmmkernel_16x2.c +#DTRMMKERNEL = ../generic/trmmkernel_16x2.c #DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S #DGEMMINCOPY = ../generic/gemm_ncopy_16.c #DGEMMITCOPY = ../generic/gemm_tcopy_16.c From f5243e8e1fc585147e8b6e1553232f5f868eff1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:47:44 +0200 Subject: [PATCH 85/86] Add compiler option to avx512 test and hide test output --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index cc64c16c6..3831d7aa3 100644 --- a/c_check +++ b/c_check @@ -205,8 +205,8 @@ $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "int main(void){ __asm__ volatile($code); }\n"; - $args = " -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? != 0) { $no_avx512 = 1; From 4e9c34018e06615ea2c0c64551691e297682e7a3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:57:50 +0200 Subject: [PATCH 86/86] Fix apparent off-by-one error in calculation of MAX_ALLOCATING_THREADS fixes #1641 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 7eff16ce3..98bcfb216 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #if defined(SMP) && !defined(USE_OPENMP) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 static int next_memory_table_pos = 0; # if defined(HAS_COMPILER_TLS) /* Use compiler generated thread-local-storage */