Fixed #395. Enable optimized cgemm for Sandybridge. Added optimized sdot kernel.
Fixed c/zgemm, zgemv computational errors of haswell, piledriver, bulldozer, and barcelona on Windows. Merge branch 'develop' of https://github.com/wernsaar/OpenBLAS into wernsaar-develop. Conflicts: kernel/Makefile.L1, kernel/x86_64/KERNEL, param.h
This commit is contained in:
commit
99efbbbad5
7
Makefile
7
Makefile
|
@ -23,7 +23,7 @@ endif
|
|||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
|
||||
.PHONY : all libs netlib test ctest shared install
|
||||
.NOTPARALLEL : all libs prof lapack-test install
|
||||
.NOTPARALLEL : all libs prof lapack-test install blas-test
|
||||
|
||||
all :: libs netlib tests shared
|
||||
@echo
|
||||
|
@ -282,6 +282,11 @@ lapack-test :
|
|||
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
||||
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
||||
|
||||
|
||||
dummy :
|
||||
|
||||
|
|
|
@ -687,15 +687,27 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
|
|||
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
|
||||
|
||||
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
ifdef DSDOTKERNEL
|
||||
|
||||
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
|
||||
|
||||
$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
|
||||
|
||||
else
|
||||
|
||||
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
|
||||
|
||||
$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
|
||||
|
||||
endif
|
||||
|
||||
$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@
|
||||
|
||||
|
|
|
@ -119,9 +119,15 @@ XCOPYKERNEL = zcopy.S
|
|||
endif
|
||||
|
||||
ifndef SDOTKERNEL
|
||||
SDOTKERNEL = ../arm/dot.c
|
||||
SDOTKERNEL = dot_sse.S
|
||||
endif
|
||||
|
||||
|
||||
ifndef DSDOTKERNEL
|
||||
DSDOTKERNEL = ../arm/dot.c
|
||||
endif
|
||||
|
||||
|
||||
ifndef DDOTKERNEL
|
||||
DDOTKERNEL = dot_sse2.S
|
||||
endif
|
||||
|
|
|
@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
|
|||
SGEMVTKERNEL = sgemv_t.S
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
|
|
|
@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
|
|||
SGEMVTKERNEL = sgemv_t.S
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_8x4_sse.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
|
|
|
@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
|
|||
SGEMVTKERNEL = sgemv_t.S
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_8x4_sse3.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
|
|
|
@ -21,11 +21,11 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
|
||||
CGEMMINCOPY = zgemm_ncopy_2.S
|
||||
CGEMMITCOPY = zgemm_tcopy_2.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMKERNEL = cgemm_kernel_8x2_sandy.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -522,16 +522,16 @@
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -541,14 +541,15 @@
|
|||
movq OLD_C, C
|
||||
movq OLD_LDC, LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd OLD_OFFSET, %xmm12
|
||||
vmovsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd STACKSIZE + 16(%rsp), %xmm12
|
||||
vmovsd STACKSIZE + 16(%rsp), %xmm12
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -1865,6 +1866,8 @@
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -1876,16 +1879,16 @@
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $STACKSIZE, %rsp
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
/*********************************************************************
|
||||
*
|
||||
* 2013/10/31 Saar
|
||||
* 2014/06/28 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -546,16 +546,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -568,6 +568,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
movsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
|
@ -1889,6 +1890,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -1900,16 +1903,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $STACKSIZE, %rsp
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
/*********************************************************************
|
||||
* 2013/11/13 Saar
|
||||
* 2014/06/28 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -816,16 +816,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -838,6 +838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
movsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
|
@ -2253,6 +2254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -2264,16 +2267,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $ STACKSIZE, %rsp
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -412,16 +412,16 @@
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -431,14 +431,15 @@
|
|||
movq OLD_C, C
|
||||
movq OLD_LDC, LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd OLD_OFFSET, %xmm12
|
||||
vmovsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd STACKSIZE + 16(%rsp), %xmm12
|
||||
vmovsd STACKSIZE + 16(%rsp), %xmm12
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -1372,6 +1373,8 @@
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -1383,16 +1386,16 @@
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $STACKSIZE, %rsp
|
||||
|
|
|
@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/*********************************************************************
|
||||
*
|
||||
* 2013/10/30 Saar
|
||||
* 2014/06/28 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -437,16 +437,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -456,14 +456,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
movq OLD_C, C
|
||||
movq OLD_LDC, LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd OLD_OFFSET, %xmm12
|
||||
vmovsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
#ifdef TRMMKERNEL
|
||||
movsd STACKSIZE + 16(%rsp), %xmm12
|
||||
vmovsd STACKSIZE + 16(%rsp), %xmm12
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -1397,6 +1398,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -1408,16 +1411,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $STACKSIZE, %rsp
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
/********************************************************************************
|
||||
* 2013/11/13 Saar
|
||||
* 2014/06/28 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -693,16 +693,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq %rdi, 48(%rsp)
|
||||
movq %rsi, 56(%rsp)
|
||||
movups %xmm6, 64(%rsp)
|
||||
movups %xmm7, 80(%rsp)
|
||||
movups %xmm8, 96(%rsp)
|
||||
movups %xmm9, 112(%rsp)
|
||||
movups %xmm10, 128(%rsp)
|
||||
movups %xmm11, 144(%rsp)
|
||||
movups %xmm12, 160(%rsp)
|
||||
movups %xmm13, 176(%rsp)
|
||||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
vmovups %xmm6, 64(%rsp)
|
||||
vmovups %xmm7, 80(%rsp)
|
||||
vmovups %xmm8, 96(%rsp)
|
||||
vmovups %xmm9, 112(%rsp)
|
||||
vmovups %xmm10, 128(%rsp)
|
||||
vmovups %xmm11, 144(%rsp)
|
||||
vmovups %xmm12, 160(%rsp)
|
||||
vmovups %xmm13, 176(%rsp)
|
||||
vmovups %xmm14, 192(%rsp)
|
||||
vmovups %xmm15, 208(%rsp)
|
||||
|
||||
movq ARG1, OLD_M
|
||||
movq ARG2, OLD_N
|
||||
|
@ -715,6 +715,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
movsd OLD_OFFSET, %xmm12
|
||||
#endif
|
||||
vmovaps %xmm3, %xmm0
|
||||
vmovsd OLD_ALPHA_I, %xmm1
|
||||
|
||||
#else
|
||||
movq STACKSIZE + 8(%rsp), LDC
|
||||
|
@ -1781,6 +1782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
.L999:
|
||||
vzeroupper
|
||||
|
||||
movq SP, %rsp
|
||||
movq (%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
|
@ -1792,16 +1795,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef WINDOWS_ABI
|
||||
movq 48(%rsp), %rdi
|
||||
movq 56(%rsp), %rsi
|
||||
movups 64(%rsp), %xmm6
|
||||
movups 80(%rsp), %xmm7
|
||||
movups 96(%rsp), %xmm8
|
||||
movups 112(%rsp), %xmm9
|
||||
movups 128(%rsp), %xmm10
|
||||
movups 144(%rsp), %xmm11
|
||||
movups 160(%rsp), %xmm12
|
||||
movups 176(%rsp), %xmm13
|
||||
movups 192(%rsp), %xmm14
|
||||
movups 208(%rsp), %xmm15
|
||||
vmovups 64(%rsp), %xmm6
|
||||
vmovups 80(%rsp), %xmm7
|
||||
vmovups 96(%rsp), %xmm8
|
||||
vmovups 112(%rsp), %xmm9
|
||||
vmovups 128(%rsp), %xmm10
|
||||
vmovups 144(%rsp), %xmm11
|
||||
vmovups 160(%rsp), %xmm12
|
||||
vmovups 176(%rsp), %xmm13
|
||||
vmovups 192(%rsp), %xmm14
|
||||
vmovups 208(%rsp), %xmm15
|
||||
#endif
|
||||
|
||||
addq $ STACKSIZE, %rsp
|
||||
|
|
2
make.inc
2
make.inc
|
@ -1,7 +1,7 @@
|
|||
SHELL = /bin/sh
|
||||
PLAT = _LINUX
|
||||
DRVOPTS = $(OPTS)
|
||||
LOADER = $(FORTRAN) -pthread
|
||||
LOADER = $(FORTRAN)
|
||||
ARCHFLAGS= -ru
|
||||
#RANLIB = ranlib
|
||||
|
||||
|
|
8
param.h
8
param.h
|
@ -1111,14 +1111,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||
#endif
|
||||
|
@ -1134,7 +1134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define QGEMM_DEFAULT_P 504
|
||||
#define QGEMM_DEFAULT_R qgemm_r
|
||||
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define CGEMM_DEFAULT_P 384
|
||||
//#define CGEMM_DEFAULT_R cgemm_r
|
||||
#define CGEMM_DEFAULT_R 1024
|
||||
|
||||
|
@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SGEMM_DEFAULT_Q 384
|
||||
#define DGEMM_DEFAULT_Q 256
|
||||
#define QGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 256
|
||||
#define CGEMM_DEFAULT_Q 192
|
||||
#define ZGEMM_DEFAULT_Q 192
|
||||
#define XGEMM_DEFAULT_Q 128
|
||||
|
||||
|
|
Loading…
Reference in New Issue