Fixed #395. Enable optimized cgemm for Sandybridge. Added optimized sdot kernel.

Fixed c/zgemm and zgemv computational errors for haswell, piledriver, bulldozer,
and barcelona on Windows.

Merge branch 'develop' of https://github.com/wernsaar/OpenBLAS into wernsaar-develop

Conflicts:
	kernel/Makefile.L1
	kernel/x86_64/KERNEL
	param.h
This commit is contained in:
Zhang Xianyi 2014-06-29 10:34:51 +08:00
commit 99efbbbad5
19 changed files with 2473 additions and 151 deletions

View File

@ -23,7 +23,7 @@ endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install .PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install .NOTPARALLEL : all libs prof lapack-test install blas-test
all :: libs netlib tests shared all :: libs netlib tests shared
@echo @echo
@ -282,6 +282,11 @@ lapack-test :
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
dummy : dummy :

View File

@ -687,15 +687,27 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
ifdef DSDOTKERNEL
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
else
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
endif
$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) $(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@

View File

@ -119,9 +119,15 @@ XCOPYKERNEL = zcopy.S
endif endif
ifndef SDOTKERNEL ifndef SDOTKERNEL
SDOTKERNEL = ../arm/dot.c SDOTKERNEL = dot_sse.S
endif endif
ifndef DSDOTKERNEL
DSDOTKERNEL = ../arm/dot.c
endif
ifndef DDOTKERNEL ifndef DDOTKERNEL
DDOTKERNEL = dot_sse2.S DDOTKERNEL = dot_sse2.S
endif endif

View File

@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c

View File

@ -1,5 +1,5 @@
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c

View File

@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S

View File

@ -1,5 +1,5 @@
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_sse.S SGEMMKERNEL = gemm_kernel_8x4_sse.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c

View File

@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S

View File

@ -1,5 +1,5 @@
ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t.S
SGEMMKERNEL = gemm_kernel_8x4_sse3.S SGEMMKERNEL = gemm_kernel_8x4_sse3.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c

View File

@ -21,11 +21,11 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S CGEMMKERNEL = cgemm_kernel_8x2_sandy.S
CGEMMINCOPY = zgemm_ncopy_2.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = zgemm_tcopy_2.S CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)

View File

@ -522,16 +522,16 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -541,14 +541,15 @@
movq OLD_C, C movq OLD_C, C
movq OLD_LDC, LDC movq OLD_LDC, LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12 vmovsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12 vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif #endif
#endif #endif
@ -1865,6 +1866,8 @@
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -1876,16 +1879,16 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $STACKSIZE, %rsp addq $STACKSIZE, %rsp

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
/********************************************************************* /*********************************************************************
* *
* 2013/10/31 Saar * 2014/06/28 Saar
* BLASTEST : OK * BLASTEST : OK
* CTEST : OK * CTEST : OK
* TEST : OK * TEST : OK
@ -546,16 +546,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -568,6 +568,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12 movsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
@ -1889,6 +1890,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -1900,16 +1903,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $STACKSIZE, %rsp addq $STACKSIZE, %rsp

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
/********************************************************************* /*********************************************************************
* 2013/11/13 Saar * 2014/06/28 Saar
* BLASTEST : OK * BLASTEST : OK
* CTEST : OK * CTEST : OK
* TEST : OK * TEST : OK
@ -816,16 +816,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -838,6 +838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12 movsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
@ -2253,6 +2254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -2264,16 +2267,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $ STACKSIZE, %rsp addq $ STACKSIZE, %rsp

File diff suppressed because it is too large Load Diff

View File

@ -412,16 +412,16 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -431,14 +431,15 @@
movq OLD_C, C movq OLD_C, C
movq OLD_LDC, LDC movq OLD_LDC, LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12 vmovsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12 vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif #endif
#endif #endif
@ -1372,6 +1373,8 @@
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -1383,16 +1386,16 @@
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $STACKSIZE, %rsp addq $STACKSIZE, %rsp

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/********************************************************************* /*********************************************************************
* *
* 2013/10/30 Saar * 2014/06/28 Saar
* BLASTEST : OK * BLASTEST : OK
* CTEST : OK * CTEST : OK
* TEST : OK * TEST : OK
@ -437,16 +437,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -456,14 +456,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movq OLD_C, C movq OLD_C, C
movq OLD_LDC, LDC movq OLD_LDC, LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12 vmovsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12 vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif #endif
#endif #endif
@ -1397,6 +1398,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -1408,16 +1411,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $STACKSIZE, %rsp addq $STACKSIZE, %rsp

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
/******************************************************************************** /********************************************************************************
* 2013/11/13 Saar * 2014/06/28 Saar
* BLASTEST : OK * BLASTEST : OK
* CTEST : OK * CTEST : OK
* TEST : OK * TEST : OK
@ -693,16 +693,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq %rdi, 48(%rsp) movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp) movq %rsi, 56(%rsp)
movups %xmm6, 64(%rsp) vmovups %xmm6, 64(%rsp)
movups %xmm7, 80(%rsp) vmovups %xmm7, 80(%rsp)
movups %xmm8, 96(%rsp) vmovups %xmm8, 96(%rsp)
movups %xmm9, 112(%rsp) vmovups %xmm9, 112(%rsp)
movups %xmm10, 128(%rsp) vmovups %xmm10, 128(%rsp)
movups %xmm11, 144(%rsp) vmovups %xmm11, 144(%rsp)
movups %xmm12, 160(%rsp) vmovups %xmm12, 160(%rsp)
movups %xmm13, 176(%rsp) vmovups %xmm13, 176(%rsp)
movups %xmm14, 192(%rsp) vmovups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M movq ARG1, OLD_M
movq ARG2, OLD_N movq ARG2, OLD_N
@ -715,6 +715,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
movsd OLD_OFFSET, %xmm12 movsd OLD_OFFSET, %xmm12
#endif #endif
vmovaps %xmm3, %xmm0 vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else #else
movq STACKSIZE + 8(%rsp), LDC movq STACKSIZE + 8(%rsp), LDC
@ -1781,6 +1782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L999: .L999:
vzeroupper
movq SP, %rsp movq SP, %rsp
movq (%rsp), %rbx movq (%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
@ -1792,16 +1795,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
movq 48(%rsp), %rdi movq 48(%rsp), %rdi
movq 56(%rsp), %rsi movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6 vmovups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7 vmovups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8 vmovups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9 vmovups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10 vmovups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11 vmovups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12 vmovups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13 vmovups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14 vmovups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15 vmovups 208(%rsp), %xmm15
#endif #endif
addq $ STACKSIZE, %rsp addq $ STACKSIZE, %rsp

View File

@ -1,7 +1,7 @@
SHELL = /bin/sh SHELL = /bin/sh
PLAT = _LINUX PLAT = _LINUX
DRVOPTS = $(OPTS) DRVOPTS = $(OPTS)
LOADER = $(FORTRAN) -pthread LOADER = $(FORTRAN)
ARCHFLAGS= -ru ARCHFLAGS= -ru
#RANLIB = ranlib #RANLIB = ranlib

View File

@ -1111,14 +1111,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1
#endif #endif
@ -1134,7 +1134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r #define QGEMM_DEFAULT_R qgemm_r
#define CGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 384
//#define CGEMM_DEFAULT_R cgemm_r //#define CGEMM_DEFAULT_R cgemm_r
#define CGEMM_DEFAULT_R 1024 #define CGEMM_DEFAULT_R 1024
@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 192
#define ZGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128