Merge commit 'origin/x86' into loongson3a
This commit is contained in:
commit
88cbfcc5b5
|
@ -4,6 +4,8 @@ Version 0.1 (in development)
|
||||||
26-Feb-2011
|
26-Feb-2011
|
||||||
|
|
||||||
common:
|
common:
|
||||||
|
* Support "make NO_LAPACK=1" to build the library without
|
||||||
|
LAPACK functions.
|
||||||
* Fixed a random SEGFAULT when nodemask==NULL on Linux kernels newer than 2.6.34.
|
* Fixed a random SEGFAULT when nodemask==NULL on Linux kernels newer than 2.6.34.
|
||||||
Thanks to Mr. Ei-ji Nakama for providing this patch. (Refs issue #12 on github)
|
Thanks to Mr. Ei-ji Nakama for providing this patch. (Refs issue #12 on github)
|
||||||
* Added DEBUG=1 rule in Makefile.rule to build debug version.
|
* Added DEBUG=1 rule in Makefile.rule to build debug version.
|
||||||
|
@ -13,6 +15,9 @@ common:
|
||||||
* Imported GotoBLAS2 1.13 BSD version
|
* Imported GotoBLAS2 1.13 BSD version
|
||||||
|
|
||||||
x86/x86 64:
|
x86/x86 64:
|
||||||
|
* On 32-bit x86, gcc 4.4.3 generated wrong code (movsd) from movlps
|
||||||
|
in zdot_sse2.S line 191. This would cause zdotu & zdotc failures.
|
||||||
|
Instead, work around it. (Refs issue #8 #9 on github)
|
||||||
* Modified ?axpy functions to return same netlib BLAS results
|
* Modified ?axpy functions to return same netlib BLAS results
|
||||||
when incx==0 or incy==0 (Refs issue #7 on github)
|
when incx==0 or incy==0 (Refs issue #7 on github)
|
||||||
* Modified ?swap functions to return same netlib BLAS results
|
* Modified ?swap functions to return same netlib BLAS results
|
||||||
|
|
10
Makefile
10
Makefile
|
@ -15,7 +15,10 @@ ifdef SANITY_CHECK
|
||||||
BLASDIRS += reference
|
BLASDIRS += reference
|
||||||
endif
|
endif
|
||||||
|
|
||||||
SUBDIRS = $(BLASDIRS) lapack
|
SUBDIRS = $(BLASDIRS)
|
||||||
|
ifneq ($(NO_LAPACK), 1)
|
||||||
|
SUBDIRS += lapack
|
||||||
|
endif
|
||||||
|
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||||
|
|
||||||
|
@ -149,10 +152,15 @@ hpl_p :
|
||||||
fi; \
|
fi; \
|
||||||
done
|
done
|
||||||
|
|
||||||
|
ifeq ($(NO_LAPACK), 1)
|
||||||
|
netlib :
|
||||||
|
|
||||||
|
else
|
||||||
netlib : lapack-3.1.1 patch.for_lapack-3.1.1 lapack-3.1.1/make.inc
|
netlib : lapack-3.1.1 patch.for_lapack-3.1.1 lapack-3.1.1/make.inc
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-@$(MAKE) -C lapack-3.1.1 lapacklib
|
-@$(MAKE) -C lapack-3.1.1 lapacklib
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
prof_lapack : lapack-3.1.1 lapack-3.1.1/make.inc
|
prof_lapack : lapack-3.1.1 lapack-3.1.1/make.inc
|
||||||
-@$(MAKE) -C lapack-3.1.1 lapack_prof
|
-@$(MAKE) -C lapack-3.1.1 lapack_prof
|
||||||
|
|
|
@ -43,6 +43,9 @@ VERSION = 0.1
|
||||||
# If you don't need CBLAS interface, please comment it in.
|
# If you don't need CBLAS interface, please comment it in.
|
||||||
# NO_CBLAS = 1
|
# NO_CBLAS = 1
|
||||||
|
|
||||||
|
# If you don't need LAPACK, please comment it in.
|
||||||
|
# NO_LAPACK = 1
|
||||||
|
|
||||||
# If you want to use legacy threaded Level 3 implementation.
|
# If you want to use legacy threaded Level 3 implementation.
|
||||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||||
|
|
||||||
|
|
|
@ -75,13 +75,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
||||||
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
||||||
|
|
||||||
libgoto2.def : gensymbol
|
libgoto2.def : gensymbol
|
||||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
libgoto2_shared.def : gensymbol
|
libgoto2_shared.def : gensymbol
|
||||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
libgoto_hpl.def : gensymbol
|
libgoto_hpl.def : gensymbol
|
||||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||||
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
|
@ -162,23 +162,23 @@ static : ../$(LIBNAME)
|
||||||
rm -f goto.$(SUFFIX)
|
rm -f goto.$(SUFFIX)
|
||||||
|
|
||||||
linux.def : gensymbol ../Makefile.system ../getarch.c
|
linux.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F)
|
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||||
|
|
||||||
symbol.S : gensymbol
|
symbol.S : gensymbol
|
||||||
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) > symbol.S
|
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S
|
||||||
|
|
||||||
test : linktest.c
|
test : linktest.c
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > linktest.c
|
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c
|
||||||
|
|
||||||
clean ::
|
clean ::
|
||||||
@rm -f *.def *.dylib __.SYMDEF*
|
@rm -f *.def *.dylib __.SYMDEF*
|
||||||
|
|
|
@ -298,7 +298,10 @@
|
||||||
zcgesv, zlag2c, clag2z,
|
zcgesv, zlag2c, clag2z,
|
||||||
);
|
);
|
||||||
|
|
||||||
if (-d "../lapack-3.1.1") {
|
if ($ARGV[5] == 1) {
|
||||||
|
#NO_LAPACK=1
|
||||||
|
@objs = (@blasobjs);
|
||||||
|
} elsif (-d "../lapack-3.1.1") {
|
||||||
@objs = (@blasobjs, @lapackobjs, @lapackobjs2);
|
@objs = (@blasobjs, @lapackobjs, @lapackobjs2);
|
||||||
} else {
|
} else {
|
||||||
@objs = (@blasobjs, @lapackobjs);
|
@objs = (@blasobjs, @lapackobjs);
|
||||||
|
|
|
@ -342,36 +342,45 @@ CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
|
||||||
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
|
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
|
||||||
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
||||||
|
|
||||||
SBLASOBJS += \
|
SLAPACKOBJS = \
|
||||||
sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \
|
sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \
|
||||||
spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \
|
spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \
|
||||||
slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \
|
slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \
|
||||||
|
|
||||||
DBLASOBJS += \
|
DLAPACKOBJS = \
|
||||||
dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \
|
dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \
|
||||||
dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \
|
dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \
|
||||||
dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \
|
dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \
|
||||||
|
|
||||||
QBLASOBJS += \
|
QLAPACKOBJS = \
|
||||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||||
|
|
||||||
CBLASOBJS += \
|
CLAPACKOBJS = \
|
||||||
cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \
|
cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \
|
||||||
cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \
|
cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \
|
||||||
claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \
|
claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \
|
||||||
|
|
||||||
ZBLASOBJS += \
|
ZLAPACKOBJS = \
|
||||||
zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \
|
zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \
|
||||||
zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \
|
zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \
|
||||||
zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \
|
zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \
|
||||||
|
|
||||||
XBLASOBJS += \
|
XLAPACKOBJS = \
|
||||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||||
|
|
||||||
|
ifneq ($(NO_LAPACK), 1)
|
||||||
|
SBLASOBJS += $(SLAPACKOBJS)
|
||||||
|
DBLASOBJS += $(DLAPACKOBJS)
|
||||||
|
QBLASOBJS += $(QLAPACKOBJS)
|
||||||
|
CBLASOBJS += $(CLAPACKOBJS)
|
||||||
|
ZBLASOBJS += $(ZLAPACKOBJS)
|
||||||
|
XBLASOBJS += $(XLAPACKOBJS)
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
ifneq ($(NO_LAPACK), 1)
|
||||||
SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
@ -10,6 +11,8 @@ ZBLASOBJS += zneg_tcopy$(TSUFFIX).$(SUFFIX) zlaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
$(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c
|
||||||
$(CC) -c $(CFLAGS) $< -o $@
|
$(CC) -c $(CFLAGS) $< -o $@
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,9 @@
|
||||||
#undef movsd
|
#undef movsd
|
||||||
|
|
||||||
#ifndef OPTERON
|
#ifndef OPTERON
|
||||||
#define movlps movsd
|
#define MOVLPS movsd
|
||||||
|
#else
|
||||||
|
#define MOVLPS movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
@ -351,11 +353,11 @@
|
||||||
sarl $3, %eax
|
sarl $3, %eax
|
||||||
jle .L25
|
jle .L25
|
||||||
|
|
||||||
movlps -16 * SIZE(X), %xmm4
|
MOVLPS -16 * SIZE(X), %xmm4
|
||||||
movhps -15 * SIZE(X), %xmm4
|
movhps -15 * SIZE(X), %xmm4
|
||||||
movaps -16 * SIZE(Y), %xmm6
|
movaps -16 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
movlps -14 * SIZE(X), %xmm5
|
MOVLPS -14 * SIZE(X), %xmm5
|
||||||
movhps -13 * SIZE(X), %xmm5
|
movhps -13 * SIZE(X), %xmm5
|
||||||
movaps -14 * SIZE(Y), %xmm7
|
movaps -14 * SIZE(Y), %xmm7
|
||||||
|
|
||||||
|
@ -373,7 +375,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(Y), %xmm6
|
movaps -12 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(X), %xmm4
|
MOVLPS -12 * SIZE(X), %xmm4
|
||||||
movhps -11 * SIZE(X), %xmm4
|
movhps -11 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -382,7 +384,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(Y), %xmm7
|
movaps -10 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(X), %xmm5
|
MOVLPS -10 * SIZE(X), %xmm5
|
||||||
movhps -9 * SIZE(X), %xmm5
|
movhps -9 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -395,7 +397,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -8 * SIZE(Y), %xmm6
|
movaps -8 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -8 * SIZE(X), %xmm4
|
MOVLPS -8 * SIZE(X), %xmm4
|
||||||
movhps -7 * SIZE(X), %xmm4
|
movhps -7 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -404,7 +406,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -6 * SIZE(Y), %xmm7
|
movaps -6 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -6 * SIZE(X), %xmm5
|
MOVLPS -6 * SIZE(X), %xmm5
|
||||||
movhps -5 * SIZE(X), %xmm5
|
movhps -5 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -417,7 +419,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -4 * SIZE(Y), %xmm6
|
movaps -4 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -4 * SIZE(X), %xmm4
|
MOVLPS -4 * SIZE(X), %xmm4
|
||||||
movhps -3 * SIZE(X), %xmm4
|
movhps -3 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -426,7 +428,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -2 * SIZE(Y), %xmm7
|
movaps -2 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -2 * SIZE(X), %xmm5
|
MOVLPS -2 * SIZE(X), %xmm5
|
||||||
movhps -1 * SIZE(X), %xmm5
|
movhps -1 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -439,7 +441,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps 0 * SIZE(Y), %xmm6
|
movaps 0 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -448,7 +450,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps 2 * SIZE(Y), %xmm7
|
movaps 2 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 2 * SIZE(X), %xmm5
|
MOVLPS 2 * SIZE(X), %xmm5
|
||||||
movhps 3 * SIZE(X), %xmm5
|
movhps 3 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -465,7 +467,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(Y), %xmm6
|
movaps -12 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(X), %xmm4
|
MOVLPS -12 * SIZE(X), %xmm4
|
||||||
movhps -11 * SIZE(X), %xmm4
|
movhps -11 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -474,7 +476,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(Y), %xmm7
|
movaps -10 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(X), %xmm5
|
MOVLPS -10 * SIZE(X), %xmm5
|
||||||
movhps -9 * SIZE(X), %xmm5
|
movhps -9 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -483,7 +485,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -8 * SIZE(Y), %xmm6
|
movaps -8 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -8 * SIZE(X), %xmm4
|
MOVLPS -8 * SIZE(X), %xmm4
|
||||||
movhps -7 * SIZE(X), %xmm4
|
movhps -7 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -492,7 +494,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -6 * SIZE(Y), %xmm7
|
movaps -6 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -6 * SIZE(X), %xmm5
|
MOVLPS -6 * SIZE(X), %xmm5
|
||||||
movhps -5 * SIZE(X), %xmm5
|
movhps -5 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -501,7 +503,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -4 * SIZE(Y), %xmm6
|
movaps -4 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -4 * SIZE(X), %xmm4
|
MOVLPS -4 * SIZE(X), %xmm4
|
||||||
movhps -3 * SIZE(X), %xmm4
|
movhps -3 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -510,7 +512,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -2 * SIZE(Y), %xmm7
|
movaps -2 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -2 * SIZE(X), %xmm5
|
MOVLPS -2 * SIZE(X), %xmm5
|
||||||
movhps -1 * SIZE(X), %xmm5
|
movhps -1 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -534,11 +536,11 @@
|
||||||
testl $4, N
|
testl $4, N
|
||||||
jle .L26
|
jle .L26
|
||||||
|
|
||||||
movlps -16 * SIZE(X), %xmm4
|
MOVLPS -16 * SIZE(X), %xmm4
|
||||||
movhps -15 * SIZE(X), %xmm4
|
movhps -15 * SIZE(X), %xmm4
|
||||||
movaps -16 * SIZE(Y), %xmm6
|
movaps -16 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
movlps -14 * SIZE(X), %xmm5
|
MOVLPS -14 * SIZE(X), %xmm5
|
||||||
movhps -13 * SIZE(X), %xmm5
|
movhps -13 * SIZE(X), %xmm5
|
||||||
movaps -14 * SIZE(Y), %xmm7
|
movaps -14 * SIZE(Y), %xmm7
|
||||||
|
|
||||||
|
@ -547,7 +549,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(Y), %xmm6
|
movaps -12 * SIZE(Y), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(X), %xmm4
|
MOVLPS -12 * SIZE(X), %xmm4
|
||||||
movhps -11 * SIZE(X), %xmm4
|
movhps -11 * SIZE(X), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -556,7 +558,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(Y), %xmm7
|
movaps -10 * SIZE(Y), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(X), %xmm5
|
MOVLPS -10 * SIZE(X), %xmm5
|
||||||
movhps -9 * SIZE(X), %xmm5
|
movhps -9 * SIZE(X), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -580,7 +582,7 @@
|
||||||
testl $2, N
|
testl $2, N
|
||||||
jle .L27
|
jle .L27
|
||||||
|
|
||||||
movlps -16 * SIZE(X), %xmm4
|
MOVLPS -16 * SIZE(X), %xmm4
|
||||||
movhps -15 * SIZE(X), %xmm4
|
movhps -15 * SIZE(X), %xmm4
|
||||||
movaps -16 * SIZE(Y), %xmm6
|
movaps -16 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
|
@ -590,7 +592,7 @@
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
movlps -14 * SIZE(X), %xmm5
|
MOVLPS -14 * SIZE(X), %xmm5
|
||||||
movhps -13 * SIZE(X), %xmm5
|
movhps -13 * SIZE(X), %xmm5
|
||||||
movaps -14 * SIZE(Y), %xmm7
|
movaps -14 * SIZE(Y), %xmm7
|
||||||
|
|
||||||
|
@ -608,7 +610,7 @@
|
||||||
testl $1, N
|
testl $1, N
|
||||||
jle .L98
|
jle .L98
|
||||||
|
|
||||||
movlps -16 * SIZE(X), %xmm4
|
MOVLPS -16 * SIZE(X), %xmm4
|
||||||
movhps -15 * SIZE(X), %xmm4
|
movhps -15 * SIZE(X), %xmm4
|
||||||
movaps -16 * SIZE(Y), %xmm6
|
movaps -16 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
|
@ -628,11 +630,11 @@
|
||||||
sarl $3, %eax
|
sarl $3, %eax
|
||||||
jle .L35
|
jle .L35
|
||||||
|
|
||||||
movlps -16 * SIZE(Y), %xmm4
|
MOVLPS -16 * SIZE(Y), %xmm4
|
||||||
movhps -15 * SIZE(Y), %xmm4
|
movhps -15 * SIZE(Y), %xmm4
|
||||||
movaps -16 * SIZE(X), %xmm6
|
movaps -16 * SIZE(X), %xmm6
|
||||||
|
|
||||||
movlps -14 * SIZE(Y), %xmm5
|
MOVLPS -14 * SIZE(Y), %xmm5
|
||||||
movhps -13 * SIZE(Y), %xmm5
|
movhps -13 * SIZE(Y), %xmm5
|
||||||
movaps -14 * SIZE(X), %xmm7
|
movaps -14 * SIZE(X), %xmm7
|
||||||
|
|
||||||
|
@ -650,7 +652,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(X), %xmm6
|
movaps -12 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(Y), %xmm4
|
MOVLPS -12 * SIZE(Y), %xmm4
|
||||||
movhps -11 * SIZE(Y), %xmm4
|
movhps -11 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -659,7 +661,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(X), %xmm7
|
movaps -10 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(Y), %xmm5
|
MOVLPS -10 * SIZE(Y), %xmm5
|
||||||
movhps -9 * SIZE(Y), %xmm5
|
movhps -9 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -671,7 +673,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -8 * SIZE(X), %xmm6
|
movaps -8 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -8 * SIZE(Y), %xmm4
|
MOVLPS -8 * SIZE(Y), %xmm4
|
||||||
movhps -7 * SIZE(Y), %xmm4
|
movhps -7 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -680,7 +682,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -6 * SIZE(X), %xmm7
|
movaps -6 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -6 * SIZE(Y), %xmm5
|
MOVLPS -6 * SIZE(Y), %xmm5
|
||||||
movhps -5 * SIZE(Y), %xmm5
|
movhps -5 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -693,7 +695,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -4 * SIZE(X), %xmm6
|
movaps -4 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -4 * SIZE(Y), %xmm4
|
MOVLPS -4 * SIZE(Y), %xmm4
|
||||||
movhps -3 * SIZE(Y), %xmm4
|
movhps -3 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -702,7 +704,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -2 * SIZE(X), %xmm7
|
movaps -2 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -2 * SIZE(Y), %xmm5
|
MOVLPS -2 * SIZE(Y), %xmm5
|
||||||
movhps -1 * SIZE(Y), %xmm5
|
movhps -1 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -715,7 +717,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps 0 * SIZE(X), %xmm6
|
movaps 0 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(Y), %xmm4
|
MOVLPS 0 * SIZE(Y), %xmm4
|
||||||
movhps 1 * SIZE(Y), %xmm4
|
movhps 1 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -724,7 +726,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps 2 * SIZE(X), %xmm7
|
movaps 2 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 2 * SIZE(Y), %xmm5
|
MOVLPS 2 * SIZE(Y), %xmm5
|
||||||
movhps 3 * SIZE(Y), %xmm5
|
movhps 3 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -741,7 +743,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(X), %xmm6
|
movaps -12 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(Y), %xmm4
|
MOVLPS -12 * SIZE(Y), %xmm4
|
||||||
movhps -11 * SIZE(Y), %xmm4
|
movhps -11 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -750,7 +752,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(X), %xmm7
|
movaps -10 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(Y), %xmm5
|
MOVLPS -10 * SIZE(Y), %xmm5
|
||||||
movhps -9 * SIZE(Y), %xmm5
|
movhps -9 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -759,7 +761,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -8 * SIZE(X), %xmm6
|
movaps -8 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -8 * SIZE(Y), %xmm4
|
MOVLPS -8 * SIZE(Y), %xmm4
|
||||||
movhps -7 * SIZE(Y), %xmm4
|
movhps -7 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -768,7 +770,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -6 * SIZE(X), %xmm7
|
movaps -6 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -6 * SIZE(Y), %xmm5
|
MOVLPS -6 * SIZE(Y), %xmm5
|
||||||
movhps -5 * SIZE(Y), %xmm5
|
movhps -5 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -777,7 +779,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -4 * SIZE(X), %xmm6
|
movaps -4 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -4 * SIZE(Y), %xmm4
|
MOVLPS -4 * SIZE(Y), %xmm4
|
||||||
movhps -3 * SIZE(Y), %xmm4
|
movhps -3 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -786,7 +788,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -2 * SIZE(X), %xmm7
|
movaps -2 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -2 * SIZE(Y), %xmm5
|
MOVLPS -2 * SIZE(Y), %xmm5
|
||||||
movhps -1 * SIZE(Y), %xmm5
|
movhps -1 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -810,11 +812,11 @@
|
||||||
testl $4, N
|
testl $4, N
|
||||||
jle .L36
|
jle .L36
|
||||||
|
|
||||||
movlps -16 * SIZE(Y), %xmm4
|
MOVLPS -16 * SIZE(Y), %xmm4
|
||||||
movhps -15 * SIZE(Y), %xmm4
|
movhps -15 * SIZE(Y), %xmm4
|
||||||
movaps -16 * SIZE(X), %xmm6
|
movaps -16 * SIZE(X), %xmm6
|
||||||
|
|
||||||
movlps -14 * SIZE(Y), %xmm5
|
MOVLPS -14 * SIZE(Y), %xmm5
|
||||||
movhps -13 * SIZE(Y), %xmm5
|
movhps -13 * SIZE(Y), %xmm5
|
||||||
movaps -14 * SIZE(X), %xmm7
|
movaps -14 * SIZE(X), %xmm7
|
||||||
|
|
||||||
|
@ -823,7 +825,7 @@
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movaps -12 * SIZE(X), %xmm6
|
movaps -12 * SIZE(X), %xmm6
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps -12 * SIZE(Y), %xmm4
|
MOVLPS -12 * SIZE(Y), %xmm4
|
||||||
movhps -11 * SIZE(Y), %xmm4
|
movhps -11 * SIZE(Y), %xmm4
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -832,7 +834,7 @@
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movaps -10 * SIZE(X), %xmm7
|
movaps -10 * SIZE(X), %xmm7
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps -10 * SIZE(Y), %xmm5
|
MOVLPS -10 * SIZE(Y), %xmm5
|
||||||
movhps -9 * SIZE(Y), %xmm5
|
movhps -9 * SIZE(Y), %xmm5
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
|
@ -856,7 +858,7 @@
|
||||||
testl $2, N
|
testl $2, N
|
||||||
jle .L37
|
jle .L37
|
||||||
|
|
||||||
movlps -16 * SIZE(Y), %xmm4
|
MOVLPS -16 * SIZE(Y), %xmm4
|
||||||
movhps -15 * SIZE(Y), %xmm4
|
movhps -15 * SIZE(Y), %xmm4
|
||||||
movaps -16 * SIZE(X), %xmm6
|
movaps -16 * SIZE(X), %xmm6
|
||||||
|
|
||||||
|
@ -866,7 +868,7 @@
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
movlps -14 * SIZE(Y), %xmm5
|
MOVLPS -14 * SIZE(Y), %xmm5
|
||||||
movhps -13 * SIZE(Y), %xmm5
|
movhps -13 * SIZE(Y), %xmm5
|
||||||
movaps -14 * SIZE(X), %xmm7
|
movaps -14 * SIZE(X), %xmm7
|
||||||
|
|
||||||
|
@ -887,7 +889,7 @@
|
||||||
testl $1, N
|
testl $1, N
|
||||||
jle .L98
|
jle .L98
|
||||||
|
|
||||||
movlps -16 * SIZE(Y), %xmm4
|
MOVLPS -16 * SIZE(Y), %xmm4
|
||||||
movhps -15 * SIZE(Y), %xmm4
|
movhps -15 * SIZE(Y), %xmm4
|
||||||
movaps -16 * SIZE(X), %xmm6
|
movaps -16 * SIZE(X), %xmm6
|
||||||
|
|
||||||
|
@ -1188,8 +1190,8 @@
|
||||||
testl $1, N
|
testl $1, N
|
||||||
jle .L48
|
jle .L48
|
||||||
|
|
||||||
movlps -16 * SIZE(X), %xmm4
|
movlpd -16 * SIZE(X), %xmm4
|
||||||
movlps -16 * SIZE(Y), %xmm6
|
movlpd -16 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
|
@ -1211,17 +1213,17 @@
|
||||||
sarl $3, %eax
|
sarl $3, %eax
|
||||||
jle .L55
|
jle .L55
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
|
@ -1233,11 +1235,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1245,11 +1247,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1257,11 +1259,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1269,11 +1271,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1281,11 +1283,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1293,11 +1295,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1305,11 +1307,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1317,11 +1319,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1334,11 +1336,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1346,11 +1348,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1358,11 +1360,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1370,11 +1372,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1382,11 +1384,11 @@
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1394,11 +1396,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1420,28 +1422,28 @@
|
||||||
testl $4, N
|
testl $4, N
|
||||||
jle .L56
|
jle .L56
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
mulpd %xmm4, %xmm6
|
mulpd %xmm4, %xmm6
|
||||||
addpd %xmm6, %xmm0
|
addpd %xmm6, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1449,11 +1451,11 @@
|
||||||
pshufd $0x4e, %xmm7, %xmm3
|
pshufd $0x4e, %xmm7, %xmm3
|
||||||
mulpd %xmm5, %xmm7
|
mulpd %xmm5, %xmm7
|
||||||
addpd %xmm7, %xmm0
|
addpd %xmm7, %xmm0
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
mulpd %xmm5, %xmm3
|
mulpd %xmm5, %xmm3
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
@ -1475,10 +1477,10 @@
|
||||||
testl $2, N
|
testl $2, N
|
||||||
jle .L57
|
jle .L57
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
|
@ -1488,10 +1490,10 @@
|
||||||
mulpd %xmm4, %xmm3
|
mulpd %xmm4, %xmm3
|
||||||
addpd %xmm3, %xmm1
|
addpd %xmm3, %xmm1
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm5
|
MOVLPS 0 * SIZE(X), %xmm5
|
||||||
movhps 1 * SIZE(X), %xmm5
|
movhps 1 * SIZE(X), %xmm5
|
||||||
addl INCX, X
|
addl INCX, X
|
||||||
movlps 0 * SIZE(Y), %xmm7
|
MOVLPS 0 * SIZE(Y), %xmm7
|
||||||
movhps 1 * SIZE(Y), %xmm7
|
movhps 1 * SIZE(Y), %xmm7
|
||||||
addl INCY, Y
|
addl INCY, Y
|
||||||
|
|
||||||
|
@ -1506,9 +1508,9 @@
|
||||||
testl $1, N
|
testl $1, N
|
||||||
jle .L98
|
jle .L98
|
||||||
|
|
||||||
movlps 0 * SIZE(X), %xmm4
|
MOVLPS 0 * SIZE(X), %xmm4
|
||||||
movhps 1 * SIZE(X), %xmm4
|
movhps 1 * SIZE(X), %xmm4
|
||||||
movlps 0 * SIZE(Y), %xmm6
|
MOVLPS 0 * SIZE(Y), %xmm6
|
||||||
movhps 1 * SIZE(Y), %xmm6
|
movhps 1 * SIZE(Y), %xmm6
|
||||||
|
|
||||||
pshufd $0x4e, %xmm6, %xmm3
|
pshufd $0x4e, %xmm6, %xmm3
|
||||||
|
@ -1533,8 +1535,8 @@
|
||||||
.L999:
|
.L999:
|
||||||
movl RESULT, %eax
|
movl RESULT, %eax
|
||||||
|
|
||||||
movlps %xmm0, 0 * SIZE(%eax)
|
MOVLPS %xmm0, 0 * SIZE(%eax)
|
||||||
movlps %xmm1, 1 * SIZE(%eax)
|
MOVLPS %xmm1, 1 * SIZE(%eax)
|
||||||
|
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
|
|
|
@ -55,5 +55,6 @@ void test_saxpy_inc_0(void);
|
||||||
void test_caxpy_inc_0(void);
|
void test_caxpy_inc_0(void);
|
||||||
|
|
||||||
void test_zdotu_n_1(void);
|
void test_zdotu_n_1(void);
|
||||||
|
void test_zdotu_offset_1(void);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -53,6 +53,8 @@ CU_TestInfo test_level1[]={
|
||||||
{"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0},
|
{"Testing zaxpy with incx || incy == 0",test_zaxpy_inc_0},
|
||||||
|
|
||||||
{"Testing zdotu with n == 1",test_zdotu_n_1},
|
{"Testing zdotu with n == 1",test_zdotu_n_1},
|
||||||
|
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1},
|
||||||
|
|
||||||
CU_TEST_INFO_NULL,
|
CU_TEST_INFO_NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -53,4 +53,23 @@ void test_zdotu_n_1(void)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_zdotu_offset_1(void)
|
||||||
|
{
|
||||||
|
int N=1,incX=1,incY=1;
|
||||||
|
double x1[]={1.0,2.0,3.0,4.0};
|
||||||
|
double y1[]={5.0,6.0,7.0,8.0};
|
||||||
|
double x2[]={1.0,2.0,3.0,4.0};
|
||||||
|
double y2[]={5.0,6.0,7.0,8.0};
|
||||||
|
double _Complex result1=0.0;
|
||||||
|
double _Complex result2=0.0;
|
||||||
|
//OpenBLAS
|
||||||
|
result1=BLASFUNC(zdotu)(&N,x1+1,&incX,y1+1,&incY);
|
||||||
|
//reference
|
||||||
|
result2=BLASFUNC_REF(zdotu)(&N,x2+1,&incX,y2+1,&incY);
|
||||||
|
|
||||||
|
CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS);
|
||||||
|
CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS);
|
||||||
|
// printf("\%lf,%lf\n",creal(result1),cimag(result1));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue