diff --git a/Changelog.txt b/Changelog.txt index b3c438471..a90d39f06 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -4,6 +4,8 @@ Version 0.1 (in development) 26-Feb-2011 common: + * Support "make NO_LAPACK=1" to build the library without + LAPACK functions. * Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34. Thank Mr.Ei-ji Nakama providing this patch. (Refs issue #12 on github) * Added DEBUG=1 rule in Makefile.rule to build debug version. @@ -13,6 +15,9 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86 64: + * On x86 32bits, gcc 4.4.3 generated wrong codes (movsd) from movlps + in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. + Instead,Walk around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) * Modified ?swap functions to return same netlib BLAS results diff --git a/Makefile b/Makefile index 8eb89ed87..52f649f77 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,10 @@ ifdef SANITY_CHECK BLASDIRS += reference endif -SUBDIRS = $(BLASDIRS) lapack +SUBDIRS = $(BLASDIRS) +ifneq ($(NO_LAPACK), 1) +SUBDIRS += lapack +endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench @@ -149,10 +152,15 @@ hpl_p : fi; \ done +ifeq ($(NO_LAPACK), 1) +netlib : + +else netlib : lapack-3.1.1 patch.for_lapack-3.1.1 lapack-3.1.1/make.inc ifndef NOFORTRAN -@$(MAKE) -C lapack-3.1.1 lapacklib endif +endif prof_lapack : lapack-3.1.1 lapack-3.1.1/make.inc -@$(MAKE) -C lapack-3.1.1 lapack_prof diff --git a/Makefile.rule b/Makefile.rule index d9013dd83..12b9e347d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -43,6 +43,9 @@ VERSION = 0.1 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 +# If you don't need LAPACK, please comment it in. +# NO_LAPACK = 1 + # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 diff --git a/exports/Makefile b/exports/Makefile index 00e6fed46..24cdc41c8 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -75,13 +75,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) libgoto2.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) libgoto2_shared.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) @@ -162,23 +162,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) linux.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) symbol.S : gensymbol - perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) > symbol.S + perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S test : linktest.c $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* diff --git a/exports/gensymbol b/exports/gensymbol index 8455e51b6..3d8d74dde 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -298,7 +298,10 @@ zcgesv, zlag2c, clag2z, ); -if (-d "../lapack-3.1.1") { +if ($ARGV[5] == 1) { + #NO_LAPACK=1 + @objs = (@blasobjs); +} elsif (-d "../lapack-3.1.1") { @objs = (@blasobjs, @lapackobjs, @lapackobjs2); } else { @objs = (@blasobjs, @lapackobjs); diff --git a/interface/Makefile b/interface/Makefile index 5bfc5f389..6764daa95 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -342,36 +342,45 @@ CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) -SBLASOBJS += \ +SLAPACKOBJS = \ sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \ spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \ slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \ -DBLASOBJS += \ +DLAPACKOBJS = \ dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \ dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \ dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \ -QBLASOBJS += \ +QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ -CBLASOBJS += \ +CLAPACKOBJS = \ cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \ cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \ claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \ -ZBLASOBJS += \ +ZLAPACKOBJS = \ zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \ zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \ zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \ -XBLASOBJS += \ +XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ +ifneq ($(NO_LAPACK), 1) +SBLASOBJS += $(SLAPACKOBJS) +DBLASOBJS += $(DLAPACKOBJS) +QBLASOBJS += $(QLAPACKOBJS) +CBLASOBJS += $(CLAPACKOBJS) +ZBLASOBJS += $(ZLAPACKOBJS) +XBLASOBJS += $(XLAPACKOBJS) + +endif FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) diff --git a/kernel/Makefile.LA b/kernel/Makefile.LA index 496d05cf6..88342718c 100644 --- a/kernel/Makefile.LA +++ b/kernel/Makefile.LA @@ -1,3 +1,4 @@ +ifneq ($(NO_LAPACK), 1) SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX) DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) @@ -10,6 +11,8 @@ ZBLASOBJS += zneg_tcopy$(TSUFFIX).$(SUFFIX) zlaswp_ncopy$(TSUFFIX).$(SUFFIX) XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX) +endif + $(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S index 6304f01a7..5aeefde31 100644 --- a/kernel/x86/zdot_sse2.S +++ b/kernel/x86/zdot_sse2.S @@ -60,7 +60,9 @@ #undef movsd #ifndef OPTERON -#define movlps movsd +#define MOVLPS movsd +#else +#define MOVLPS movlps #endif PROLOGUE @@ -351,11 +353,11 @@ sarl $3, %eax jle .L25 - movlps -16 * SIZE(X), %xmm4 + MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 - movlps -14 * SIZE(X), %xmm5 + MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 @@ -373,7 +375,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(X), %xmm4 + MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -382,7 +384,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(X), %xmm5 + MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -395,7 +397,7 @@ addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -8 * SIZE(X), %xmm4 + MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -404,7 +406,7 @@ addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -6 * SIZE(X), %xmm5 + MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -417,7 +419,7 @@ addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -4 * SIZE(X), %xmm4 + MOVLPS -4 * SIZE(X), %xmm4 movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -426,7 +428,7 @@ addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -2 * SIZE(X), %xmm5 + MOVLPS -2 * SIZE(X), %xmm5 movhps -1 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -439,7 +441,7 @@ addpd %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -448,7 +450,7 @@ addpd %xmm7, %xmm0 movaps 2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps 2 * SIZE(X), %xmm5 + MOVLPS 2 * SIZE(X), %xmm5 movhps 3 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -465,7 +467,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(X), %xmm4 + MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -474,7 +476,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(X), %xmm5 + MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -483,7 +485,7 @@ addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -8 * SIZE(X), %xmm4 + MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -492,7 +494,7 @@ addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -6 * SIZE(X), %xmm5 + MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -501,7 +503,7 @@ addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -4 * SIZE(X), %xmm4 + MOVLPS -4 * SIZE(X), %xmm4 movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -510,7 +512,7 @@ addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -2 * SIZE(X), %xmm5 + MOVLPS -2 * SIZE(X), %xmm5 movhps -1 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -534,11 +536,11 @@ testl $4, N jle .L26 - movlps -16 * SIZE(X), %xmm4 + MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 - movlps -14 * SIZE(X), %xmm5 + MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 @@ -547,7 +549,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(X), %xmm4 + MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 @@ -556,7 +558,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(X), %xmm5 + MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 @@ -580,7 +582,7 @@ testl $2, N jle .L27 - movlps -16 * SIZE(X), %xmm4 + MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 @@ -590,7 +592,7 @@ mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - movlps -14 * SIZE(X), %xmm5 + MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 @@ -608,7 +610,7 @@ testl $1, N jle .L98 - movlps -16 * SIZE(X), %xmm4 + MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 @@ -628,11 +630,11 @@ sarl $3, %eax jle .L35 - movlps -16 * SIZE(Y), %xmm4 + MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 - movlps -14 * SIZE(Y), %xmm5 + MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 @@ -650,7 +652,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(Y), %xmm4 + MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -659,7 +661,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(Y), %xmm5 + MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -671,7 +673,7 @@ addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -8 * SIZE(Y), %xmm4 + MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -680,7 +682,7 @@ addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -6 * SIZE(Y), %xmm5 + MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -693,7 +695,7 @@ addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -4 * SIZE(Y), %xmm4 + MOVLPS -4 * SIZE(Y), %xmm4 movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -702,7 +704,7 @@ addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -2 * SIZE(Y), %xmm5 + MOVLPS -2 * SIZE(Y), %xmm5 movhps -1 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -715,7 +717,7 @@ addpd %xmm6, %xmm0 movaps 0 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps 0 * SIZE(Y), %xmm4 + MOVLPS 0 * SIZE(Y), %xmm4 movhps 1 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -724,7 +726,7 @@ addpd %xmm7, %xmm0 movaps 2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps 2 * SIZE(Y), %xmm5 + MOVLPS 2 * SIZE(Y), %xmm5 movhps 3 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -741,7 +743,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(Y), %xmm4 + MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -750,7 +752,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(Y), %xmm5 + MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -759,7 +761,7 @@ addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -8 * SIZE(Y), %xmm4 + MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -768,7 +770,7 @@ addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -6 * SIZE(Y), %xmm5 + MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -777,7 +779,7 @@ addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -4 * SIZE(Y), %xmm4 + MOVLPS -4 * SIZE(Y), %xmm4 movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -786,7 +788,7 @@ addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -2 * SIZE(Y), %xmm5 + MOVLPS -2 * SIZE(Y), %xmm5 movhps -1 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -810,11 +812,11 @@ testl $4, N jle .L36 - movlps -16 * SIZE(Y), %xmm4 + MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 - movlps -14 * SIZE(Y), %xmm5 + MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 @@ -823,7 +825,7 @@ addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 - movlps -12 * SIZE(Y), %xmm4 + MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 @@ -832,7 +834,7 @@ addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 - movlps -10 * SIZE(Y), %xmm5 + MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 @@ -856,7 +858,7 @@ testl $2, N jle .L37 - movlps -16 * SIZE(Y), %xmm4 + MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 @@ -866,7 +868,7 @@ mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - movlps -14 * SIZE(Y), %xmm5 + MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 @@ -887,7 +889,7 @@ testl $1, N jle .L98 - movlps -16 * SIZE(Y), %xmm4 + MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 @@ -1188,8 +1190,8 @@ testl $1, N jle .L48 - movlps -16 * SIZE(X), %xmm4 - movlps -16 * SIZE(Y), %xmm6 + movlpd -16 * SIZE(X), %xmm4 + movlpd -16 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 @@ -1211,17 +1213,17 @@ sarl $3, %eax jle .L55 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y @@ -1233,11 +1235,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1245,11 +1247,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1257,11 +1259,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1269,11 +1271,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1281,11 +1283,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1293,11 +1295,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1305,11 +1307,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1317,11 +1319,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1334,11 +1336,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1346,11 +1348,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1358,11 +1360,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1370,11 +1372,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1382,11 +1384,11 @@ pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1394,11 +1396,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1420,28 +1422,28 @@ testl $4, N jle .L56 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 @@ -1449,11 +1451,11 @@ pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 @@ -1475,10 +1477,10 @@ testl $2, N jle .L57 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y @@ -1488,10 +1490,10 @@ mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - movlps 0 * SIZE(X), %xmm5 + MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X - movlps 0 * SIZE(Y), %xmm7 + MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y @@ -1506,9 +1508,9 @@ testl $1, N jle .L98 - movlps 0 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 - movlps 0 * SIZE(Y), %xmm6 + MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 @@ -1533,8 +1535,8 @@ .L999: movl RESULT, %eax - movlps %xmm0, 0 * SIZE(%eax) - movlps %xmm1, 1 * SIZE(%eax) + MOVLPS %xmm0, 0 * SIZE(%eax) + MOVLPS %xmm1, 1 * SIZE(%eax) popl %ebx popl %esi diff --git a/utest/common_utest.h b/utest/common_utest.h index 613003307..3e9ecb422 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -55,5 +55,6 @@ void test_saxpy_inc_0(void); void test_caxpy_inc_0(void); void test_zdotu_n_1(void); +void test_zdotu_offset_1(void); #endif diff --git a/utest/main.c b/utest/main.c index c6fbd48e2..f6ecf3cc0 100644 --- a/utest/main.c +++ b/utest/main.c @@ -53,6 +53,8 @@ CU_TestInfo test_level1[]={ {"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0}, {"Testing zdotu with n == 1",test_zdotu_n_1}, + {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, + CU_TEST_INFO_NULL, }; diff --git a/utest/test_dotu.c b/utest/test_dotu.c index bb720c85a..60bb3a6da 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -53,4 +53,23 @@ void test_zdotu_n_1(void) } +void test_zdotu_offset_1(void) +{ + int N=1,incX=1,incY=1; + double x1[]={1.0,2.0,3.0,4.0}; + double y1[]={5.0,6.0,7.0,8.0}; + double x2[]={1.0,2.0,3.0,4.0}; + double y2[]={5.0,6.0,7.0,8.0}; + double _Complex result1=0.0; + double _Complex result2=0.0; + //OpenBLAS + result1=BLASFUNC(zdotu)(&N,x1+1,&incX,y1+1,&incY); + //reference + result2=BLASFUNC_REF(zdotu)(&N,x2+1,&incX,y2+1,&incY); + + CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); +// printf("\%lf,%lf\n",creal(result1),cimag(result1)); + +}