Merge branch 'develop' into bulldozer
Conflicts: kernel/x86_64/KERNEL.BULLDOZER
This commit is contained in:
commit
72b1edaf1b
|
@ -4,12 +4,16 @@
|
||||||
*.dylib
|
*.dylib
|
||||||
*.def
|
*.def
|
||||||
*.o
|
*.o
|
||||||
|
*.out
|
||||||
lapack-3.1.1
|
lapack-3.1.1
|
||||||
lapack-3.1.1.tgz
|
lapack-3.1.1.tgz
|
||||||
lapack-3.4.1
|
lapack-3.4.1
|
||||||
lapack-3.4.1.tgz
|
lapack-3.4.1.tgz
|
||||||
lapack-3.4.2
|
lapack-3.4.2
|
||||||
lapack-3.4.2.tgz
|
lapack-3.4.2.tgz
|
||||||
|
lapack-netlib/make.inc
|
||||||
|
lapack-netlib/lapacke/include/lapacke_mangling.h
|
||||||
|
lapack-netlib/TESTING/testing_results.txt
|
||||||
*.so
|
*.so
|
||||||
*.a
|
*.a
|
||||||
.svn
|
.svn
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
language: c
|
||||||
|
compiler:
|
||||||
|
- gcc
|
||||||
|
|
||||||
|
env:
|
||||||
|
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
|
||||||
|
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
|
||||||
|
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
|
||||||
|
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
|
||||||
|
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||||
|
|
||||||
|
before_install:
|
||||||
|
- sudo apt-get update -qq
|
||||||
|
- sudo apt-get install -qq gfortran
|
||||||
|
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
|
||||||
|
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
||||||
|
|
||||||
|
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||||
|
|
||||||
|
# whitelist
|
||||||
|
branches:
|
||||||
|
only:
|
||||||
|
- master
|
||||||
|
- develop
|
|
@ -0,0 +1,87 @@
|
||||||
|
# Contributions to the OpenBLAS project
|
||||||
|
|
||||||
|
## Creator & Maintainer
|
||||||
|
|
||||||
|
* Zhang Xianyi <traits.zhang@gmail.com>
|
||||||
|
|
||||||
|
## Active Developers
|
||||||
|
|
||||||
|
* Wang Qian <traz0824@gmail.com>
|
||||||
|
* Optimize BLAS3 on ICT Loongson 3A.
|
||||||
|
* Optimize BLAS3 on Intel Sandy Bridge.
|
||||||
|
|
||||||
|
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
||||||
|
* Improve compatibility with complex numbers
|
||||||
|
* Build LAPACKE: C interface to LAPACK
|
||||||
|
* Improve the windows build.
|
||||||
|
|
||||||
|
## Previous Developers
|
||||||
|
|
||||||
|
* Chen Shaohu <huhumartinwar@gmail.com>
|
||||||
|
* Optimize GEMV on the Loongson 3A processor.
|
||||||
|
|
||||||
|
* Luo Wen
|
||||||
|
* Intern. Test Level-2 BLAS.
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
In chronological order:
|
||||||
|
|
||||||
|
* pipping <http://page.mi.fu-berlin.de/pipping>
|
||||||
|
* [2011-06-11] Make USE_OPENMP=0 disable openmp.
|
||||||
|
|
||||||
|
* Stefan Karpinski <stefan@karpinski.org>
|
||||||
|
* [2011-12-28] Fix a bug about SystemStubs on Mac OS X.
|
||||||
|
|
||||||
|
* Alexander Eberspächer <https://github.com/aeberspaecher>
|
||||||
|
* [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32.
|
||||||
|
|
||||||
|
* Mike Nolta <mike@nolta.net>
|
||||||
|
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.
|
||||||
|
|
||||||
|
* Sylvestre Ledru <https://github.com/sylvestre>
|
||||||
|
* [2012-07-01] Improve the detection of sparc. Fix building bug under
|
||||||
|
Hurd and kfreebsd.
|
||||||
|
|
||||||
|
* Jameson Nash <https://github.com/vtjnash>
|
||||||
|
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
|
||||||
|
make on the command line.
|
||||||
|
|
||||||
|
* Alexander Nasonov <alnsn@yandex.ru>
|
||||||
|
* [2012-11-10] Fix NetBSD build.
|
||||||
|
|
||||||
|
* Sébastien Villemot <sebastien@debian.org>
|
||||||
|
* [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.
|
||||||
|
|
||||||
|
* Werner Saar <wernsaar@googlemail.com>
|
||||||
|
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
|
||||||
|
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
|
||||||
|
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
|
||||||
|
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
|
||||||
|
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
|
||||||
|
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
|
||||||
|
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
|
||||||
|
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
|
||||||
|
|
||||||
|
* Kang-Che Sung <Explorer09@gmail.com>
|
||||||
|
* [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.
|
||||||
|
|
||||||
|
* Kenneth Hoste <kenneth.hoste@gmail.com>
|
||||||
|
* [2013-05-22] Adjust Makefile about downloading LAPACK source files.
|
||||||
|
|
||||||
|
* Lei WANG <https://github.com/wlbksy>
|
||||||
|
* [2013-05-22] Fix a bug about wget.
|
||||||
|
|
||||||
|
* Dan Luu <http://www.linkedin.com/in/danluu>
|
||||||
|
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).
|
||||||
|
|
||||||
|
* grisuthedragon <https://github.com/grisuthedragon>
|
||||||
|
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
|
||||||
|
model is used by OpenBLAS.
|
||||||
|
|
||||||
|
* Sébastien Fabbro <bicatali@gentoo.org>
|
||||||
|
* [2013-07-24] Modify makefile to respect user's LDFLAGS
|
||||||
|
* [2013-07-24] Add stack markings for GNU as arch-independent for assembler files
|
||||||
|
|
||||||
|
* [Your name or handle] <[email or website]>
|
||||||
|
* [Date] [Brief summary of your changes]
|
|
@ -1,4 +1,54 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.2.7
|
||||||
|
20-Jul-2013
|
||||||
|
common:
|
||||||
|
* Support LSB (Linux Standard Base) 4.1.
|
||||||
|
e.g. make CC=lsbcc
|
||||||
|
* Include the LAPACK 3.4.2 source code in the repo.
|
||||||
|
Avoid downloading at compile time.
|
||||||
|
* Add NO_PARALLEL_MAKE flag to disable parallel make.
|
||||||
|
* Create openblas_get_parallel to retrieve information which
|
||||||
|
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
|
||||||
|
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
|
||||||
|
* Change LIBSUFFIX from .lib to .a on windows.
|
||||||
|
* A workaround for the dtrti_U single-thread bug: replace it with the LAPACK code. (#191)
|
||||||
|
|
||||||
|
x86/x86-64:
|
||||||
|
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
|
||||||
|
AMD Bulldozer. (Thank Werner Saar)
|
||||||
|
* Add Intel Haswell support (using Sandybridge optimizations).
|
||||||
|
(Thank Dan Luu)
|
||||||
|
* Add AMD Piledriver support (using Bulldozer optimizations).
|
||||||
|
* Fix the computational error in zgemm avx kernel on
|
||||||
|
Sandybridge. (#237)
|
||||||
|
* Fix the overflow bug in gemv.
|
||||||
|
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
|
||||||
|
is very large.(#214, #221, #246).
|
||||||
|
MIPS64:
|
||||||
|
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.
|
||||||
|
|
||||||
|
Power:
|
||||||
|
* Support Power7 by old Power6 kernels. (#220)
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.2.6
|
||||||
|
2-Mar-2013
|
||||||
|
common:
|
||||||
|
* Improved OpenMP performance slightly. (d744c9)
|
||||||
|
* Improved cblas.h compatibility with Intel MKL.(#185)
|
||||||
|
* Fixed the overflowing bug in single thread cholesky factorization.
|
||||||
|
* Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174)
|
||||||
|
|
||||||
|
x86/x86-64:
|
||||||
|
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
|
||||||
|
We will tune the performance in future.
|
||||||
|
* Auto-detect Intel Xeon E7540.
|
||||||
|
* Fixed the overflowing buffer bug of gemv. (#173)
|
||||||
|
* Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189)
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.2.5
|
Version 0.2.5
|
||||||
26-Nov-2012
|
26-Nov-2012
|
||||||
|
|
91
Makefile
91
Makefile
|
@ -82,27 +82,27 @@ endif
|
||||||
shared :
|
shared :
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
ifeq ($(OSNAME), Linux)
|
ifeq ($(OSNAME), Linux)
|
||||||
$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), NetBSD)
|
ifeq ($(OSNAME), NetBSD)
|
||||||
$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
$(MAKE) -C exports dyn
|
@$(MAKE) -C exports dyn
|
||||||
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -131,30 +131,33 @@ endif
|
||||||
ifeq ($(NOFORTRAN), 1)
|
ifeq ($(NOFORTRAN), 1)
|
||||||
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
|
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
|
||||||
endif
|
endif
|
||||||
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
for d in $(SUBDIRS) ; \
|
@for d in $(SUBDIRS) ; \
|
||||||
do if test -d $$d; then \
|
do if test -d $$d; then \
|
||||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||||
fi; \
|
fi; \
|
||||||
done
|
done
|
||||||
#Save the config files for installation
|
#Save the config files for installation
|
||||||
cp Makefile.conf Makefile.conf_last
|
@cp Makefile.conf Makefile.conf_last
|
||||||
cp config.h config_last.h
|
@cp config.h config_last.h
|
||||||
ifdef QUAD_PRECISION
|
ifdef QUAD_PRECISION
|
||||||
echo "#define QUAD_PRECISION">> config_last.h
|
@echo "#define QUAD_PRECISION">> config_last.h
|
||||||
endif
|
endif
|
||||||
ifeq ($(EXPRECISION), 1)
|
ifeq ($(EXPRECISION), 1)
|
||||||
echo "#define EXPRECISION">> config_last.h
|
@echo "#define EXPRECISION">> config_last.h
|
||||||
endif
|
endif
|
||||||
##
|
##
|
||||||
ifeq ($(DYNAMIC_ARCH), 1)
|
ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
$(MAKE) -C kernel commonlibs || exit 1
|
@$(MAKE) -C kernel commonlibs || exit 1
|
||||||
for d in $(DYNAMIC_CORE) ; \
|
@for d in $(DYNAMIC_CORE) ; \
|
||||||
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
||||||
done
|
done
|
||||||
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
||||||
endif
|
endif
|
||||||
touch lib.grd
|
ifdef USE_THREAD
|
||||||
|
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||||
|
endif
|
||||||
|
@touch lib.grd
|
||||||
|
|
||||||
prof : prof_blas prof_lapack
|
prof : prof_blas prof_lapack
|
||||||
|
|
||||||
|
@ -203,19 +206,19 @@ ifeq ($(NO_LAPACK), 1)
|
||||||
netlib :
|
netlib :
|
||||||
|
|
||||||
else
|
else
|
||||||
netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
|
netlib : lapack_prebuild
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||||
endif
|
endif
|
||||||
ifndef NO_LAPACKE
|
ifndef NO_LAPACKE
|
||||||
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
|
prof_lapack : lapack_prebuild
|
||||||
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||||
|
|
||||||
$(NETLIB_LAPACK_DIR)/make.inc :
|
lapack_prebuild :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
@ -224,11 +227,7 @@ ifndef NOFORTRAN
|
||||||
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
ifdef INTERFACE64
|
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
|
|
||||||
else
|
|
||||||
-@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
|
||||||
endif
|
|
||||||
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
@ -244,7 +243,7 @@ endif
|
||||||
lapack-3.4.2 : lapack-3.4.2.tgz
|
lapack-3.4.2 : lapack-3.4.2.tgz
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
ifndef NO_LAPACK
|
ifndef NO_LAPACK
|
||||||
@if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
|
@if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
|
||||||
echo $(TAR) zxf $< ;\
|
echo $(TAR) zxf $< ;\
|
||||||
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
|
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
|
||||||
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
|
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
|
||||||
|
@ -262,27 +261,31 @@ lapack-3.4.2.tgz :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
|
||||||
curl -O $(LAPACK_URL)
|
curl -O $(LAPACK_URL);
|
||||||
else
|
else
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
fetch $(LAPACK_URL)
|
fetch $(LAPACK_URL);
|
||||||
else
|
else
|
||||||
wget $(LAPACK_URL)
|
wget -O $@ $(LAPACK_URL);
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
large.tgz :
|
large.tgz :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-wget http://www.netlib.org/lapack/timing/large.tgz
|
if [ ! -a $< ]; then
|
||||||
|
-wget http://www.netlib.org/lapack/timing/large.tgz;
|
||||||
|
fi
|
||||||
endif
|
endif
|
||||||
|
|
||||||
timing.tgz :
|
timing.tgz :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
-wget http://www.netlib.org/lapack/timing/timing.tgz
|
if [ ! -a $< ]; then
|
||||||
|
-wget http://www.netlib.org/lapack/timing/timing.tgz;
|
||||||
|
fi
|
||||||
endif
|
endif
|
||||||
|
|
||||||
lapack-timing : lapack-3.4.2 large.tgz timing.tgz
|
lapack-timing : large.tgz timing.tgz
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
||||||
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
||||||
|
@ -314,10 +317,12 @@ clean ::
|
||||||
#endif
|
#endif
|
||||||
@$(MAKE) -C reference clean
|
@$(MAKE) -C reference clean
|
||||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||||
|
ifeq ($(OSNAME), Darwin)
|
||||||
|
@rm -rf getarch.dSYM getarch_2nd.dSYM
|
||||||
|
endif
|
||||||
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||||
@if test -d $(NETLIB_LAPACK_DIR); then \
|
@touch $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
echo deleting $(NETLIB_LAPACK_DIR); \
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
|
||||||
rm -rf $(NETLIB_LAPACK_DIR) ;\
|
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
|
||||||
fi
|
|
||||||
@rm -f *.grd Makefile.conf_last config_last.h
|
@rm -f *.grd Makefile.conf_last config_last.h
|
||||||
@echo Done.
|
@echo Done.
|
|
@ -1,6 +1 @@
|
||||||
COPT = -Wall -O2 # -DGEMMTEST
|
COPT = -Wall -O2 # -DGEMMTEST
|
||||||
ifdef BINARY64
|
|
||||||
else
|
|
||||||
# LDFLAGS = -m elf32ppc
|
|
||||||
LDFLAGS = -m elf_i386
|
|
||||||
endif
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ include ./Makefile.system
|
||||||
|
|
||||||
OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
|
OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
|
||||||
OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
|
OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
|
||||||
|
OPENBLAS_BUILD_DIR:=$(CURDIR)
|
||||||
|
|
||||||
.PHONY : install
|
.PHONY : install
|
||||||
.NOTPARALLEL : install
|
.NOTPARALLEL : install
|
||||||
|
@ -48,32 +49,36 @@ endif
|
||||||
#for install static library
|
#for install static library
|
||||||
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
|
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
|
||||||
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
|
@cd $(OPENBLAS_LIBRARY_DIR) ; \
|
||||||
|
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
#for install shared library
|
#for install shared library
|
||||||
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
|
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
|
||||||
ifeq ($(OSNAME), Linux)
|
ifeq ($(OSNAME), Linux)
|
||||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
@cd $(OPENBLAS_LIBRARY_DIR) ; \
|
||||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
@cd $(OPENBLAS_LIBRARY_DIR) ; \
|
||||||
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), NetBSD)
|
ifeq ($(OSNAME), NetBSD)
|
||||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
@cd $(OPENBLAS_LIBRARY_DIR) ; \
|
||||||
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
|
@-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
@-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
||||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
|
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
@echo Install OK!
|
@echo Install OK!
|
||||||
|
|
|
@ -17,13 +17,7 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifeq ($(OSNAME), Linux)
|
|
||||||
LDFLAGS = -m elf64ppc
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Darwin)
|
|
||||||
LDFLAGS = -arch ppc64
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
CCOMMON_OPT += -mpowerpc64 -maix64
|
CCOMMON_OPT += -mpowerpc64 -maix64
|
||||||
|
@ -34,17 +28,12 @@ ifeq ($(COMPILER_F77), xlf)
|
||||||
FCOMMON_OPT += -q64
|
FCOMMON_OPT += -q64
|
||||||
endif
|
endif
|
||||||
ARFLAGS = -X 64
|
ARFLAGS = -X 64
|
||||||
LDFLAGS = -b64
|
|
||||||
ASFLAGS = -a64
|
ASFLAGS = -a64
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
ifeq ($(OSNAME), Linux)
|
|
||||||
LDFLAGS = -m elf32ppc
|
|
||||||
endif
|
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
CCOMMON_OPT += -Wa,-a32
|
CCOMMON_OPT += -Wa,-a32
|
||||||
ARFLAGS = -X 32
|
ARFLAGS = -X 32
|
||||||
LDFLAGS = -b32
|
|
||||||
ASFLAGS = -a32
|
ASFLAGS = -a32
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.2.5
|
VERSION = 0.2.7
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -81,6 +81,9 @@ VERSION = 0.2.5
|
||||||
# and OS. However, the performance is low.
|
# and OS. However, the performance is low.
|
||||||
# NO_AVX = 1
|
# NO_AVX = 1
|
||||||
|
|
||||||
|
# Don't use parallel make.
|
||||||
|
# NO_PARALLEL_MAKE = 1
|
||||||
|
|
||||||
# If you would like to know minute performance report of GotoBLAS.
|
# If you would like to know minute performance report of GotoBLAS.
|
||||||
# FUNCTION_PROFILE = 1
|
# FUNCTION_PROFILE = 1
|
||||||
|
|
||||||
|
@ -104,8 +107,8 @@ VERSION = 0.2.5
|
||||||
|
|
||||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
||||||
# in small matrix sizes. The default value is 50.
|
# in small matrix sizes. The default value is 4.
|
||||||
# GEMM_MULTITHREAD_THRESHOLD = 50
|
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||||
|
|
||||||
# If you need sanity check by comparing reference BLAS. It'll be very
|
# If you need sanity check by comparing reference BLAS. It'll be very
|
||||||
# slow (Not implemented yet).
|
# slow (Not implemented yet).
|
||||||
|
|
|
@ -10,7 +10,6 @@ endif
|
||||||
ifeq ($(COMPILER_F77), f90)
|
ifeq ($(COMPILER_F77), f90)
|
||||||
FCOMMON_OPT += -xarch=v9
|
FCOMMON_OPT += -xarch=v9
|
||||||
endif
|
endif
|
||||||
LDFLAGS = -64
|
|
||||||
else
|
else
|
||||||
|
|
||||||
CCOMMON_OPT += -mcpu=v9
|
CCOMMON_OPT += -mcpu=v9
|
||||||
|
|
134
Makefile.system
134
Makefile.system
|
@ -9,9 +9,7 @@ ifndef TOPDIR
|
||||||
TOPDIR = .
|
TOPDIR = .
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef NETLIB_LAPACK_DIR
|
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2
|
|
||||||
endif
|
|
||||||
|
|
||||||
# Default C compiler
|
# Default C compiler
|
||||||
# - Only set if not specified on the command line or inherited from the environment.
|
# - Only set if not specified on the command line or inherited from the environment.
|
||||||
|
@ -20,6 +18,12 @@ endif
|
||||||
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||||
ifeq ($(origin CC),default)
|
ifeq ($(origin CC),default)
|
||||||
CC = gcc
|
CC = gcc
|
||||||
|
# Change the default compiler to clang on Mac OS X.
|
||||||
|
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
||||||
|
UNAME_S := $(shell uname -s)
|
||||||
|
ifeq ($(UNAME_S),Darwin)
|
||||||
|
CC = clang
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Default Fortran compiler (FC) is selected by f_check.
|
# Default Fortran compiler (FC) is selected by f_check.
|
||||||
|
@ -53,7 +57,7 @@ GETARCH_FLAGS += -DUSE64BITINT
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef GEMM_MULTITHREAD_THRESHOLD
|
ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||||
GEMM_MULTITHREAD_THRESHOLD=50
|
GEMM_MULTITHREAD_THRESHOLD=4
|
||||||
endif
|
endif
|
||||||
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
|
||||||
|
@ -65,6 +69,19 @@ ifeq ($(DEBUG), 1)
|
||||||
GETARCH_FLAGS += -g
|
GETARCH_FLAGS += -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(QUIET_MAKE), 1)
|
||||||
|
MAKE += -s
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef NO_PARALLEL_MAKE
|
||||||
|
NO_PARALLEL_MAKE=0
|
||||||
|
endif
|
||||||
|
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
|
||||||
|
|
||||||
|
ifeq ($(HOSTCC), loongcc)
|
||||||
|
GETARCH_FLAGS += -static
|
||||||
|
endif
|
||||||
|
|
||||||
# This operation is expensive, so execution should be once.
|
# This operation is expensive, so execution should be once.
|
||||||
ifndef GOTOBLAS_MAKEFILE
|
ifndef GOTOBLAS_MAKEFILE
|
||||||
export GOTOBLAS_MAKEFILE = 1
|
export GOTOBLAS_MAKEFILE = 1
|
||||||
|
@ -148,7 +165,12 @@ EXTRALIB += -defaultlib:advapi32
|
||||||
|
|
||||||
SUFFIX = obj
|
SUFFIX = obj
|
||||||
PSUFFIX = pobj
|
PSUFFIX = pobj
|
||||||
LIBSUFFIX = lib
|
LIBSUFFIX = a
|
||||||
|
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
CCOMMON_OPT += -DMS_ABI
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
#Test for supporting MS_ABI
|
#Test for supporting MS_ABI
|
||||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
|
@ -167,8 +189,15 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
|
||||||
CCOMMON_OPT += -DMS_ABI
|
CCOMMON_OPT += -DMS_ABI
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Ensure the correct stack alignment on Win32
|
||||||
|
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
|
||||||
|
ifeq ($(ARCH), x86)
|
||||||
|
CCOMMON_OPT += -mincoming-stack-boundary=2
|
||||||
|
FCOMMON_OPT += -mincoming-stack-boundary=2
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
|
@ -223,11 +252,17 @@ NO_BINARY_MODE = 1
|
||||||
endif
|
endif
|
||||||
ifndef NO_EXPRECISION
|
ifndef NO_EXPRECISION
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifeq ($(C_COMPILER), GCC)
|
# ifeq logical or. GCC or LSB
|
||||||
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||||
EXPRECISION = 1
|
EXPRECISION = 1
|
||||||
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
||||||
FCOMMON_OPT += -m128bit-long-double
|
FCOMMON_OPT += -m128bit-long-double
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
EXPRECISION = 1
|
||||||
|
CCOMMON_OPT += -DEXPRECISION
|
||||||
|
FCOMMON_OPT += -m128bit-long-double
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
@ -235,11 +270,17 @@ endif
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
ifndef NO_EXPRECISION
|
ifndef NO_EXPRECISION
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifeq ($(C_COMPILER), GCC)
|
# ifeq logical or. GCC or LSB
|
||||||
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||||
EXPRECISION = 1
|
EXPRECISION = 1
|
||||||
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
||||||
FCOMMON_OPT += -m128bit-long-double
|
FCOMMON_OPT += -m128bit-long-double
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
EXPRECISION = 1
|
||||||
|
CCOMMON_OPT += -DEXPRECISION
|
||||||
|
FCOMMON_OPT += -m128bit-long-double
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
@ -249,7 +290,13 @@ CCOMMON_OPT += -wd981
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifeq ($(C_COMPILER), GCC)
|
# ifeq logical or. GCC or LSB
|
||||||
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||||
|
CCOMMON_OPT += -fopenmp
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
$(error OpenBLAS: Clang didn't support OpenMP yet.)
|
||||||
CCOMMON_OPT += -fopenmp
|
CCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -277,14 +324,14 @@ ifeq ($(ARCH), x86)
|
||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -318,11 +365,18 @@ endif
|
||||||
# C Compiler dependent settings
|
# C Compiler dependent settings
|
||||||
#
|
#
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), GCC)
|
|
||||||
|
# ifeq logical or. GCC or CLANG or LSB
|
||||||
|
# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||||
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
|
||||||
CCOMMON_OPT += -Wall
|
CCOMMON_OPT += -Wall
|
||||||
COMMON_PROF += -fno-inline
|
COMMON_PROF += -fno-inline
|
||||||
NO_UNINITIALIZED_WARN = -Wno-uninitialized
|
NO_UNINITIALIZED_WARN = -Wno-uninitialized
|
||||||
|
|
||||||
|
ifeq ($(QUIET_MAKE), 1)
|
||||||
|
CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef NO_BINARY_MODE
|
ifdef NO_BINARY_MODE
|
||||||
|
|
||||||
ifeq ($(ARCH), mips64)
|
ifeq ($(ARCH), mips64)
|
||||||
|
@ -407,7 +461,12 @@ endif
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||||
FCOMMON_OPT += -Wall
|
FCOMMON_OPT += -Wall
|
||||||
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
|
ifneq ($(NO_LAPACK), 1)
|
||||||
|
ifneq ($(C_COMPILER), LSB)
|
||||||
EXTRALIB += -lgfortran
|
EXTRALIB += -lgfortran
|
||||||
|
endif
|
||||||
|
endif
|
||||||
ifdef NO_BINARY_MODE
|
ifdef NO_BINARY_MODE
|
||||||
ifeq ($(ARCH), mips64)
|
ifeq ($(ARCH), mips64)
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
|
@ -514,11 +573,28 @@ ifdef INTERFACE64
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), mips64)
|
||||||
|
ifndef BINARY64
|
||||||
|
FCOMMON_OPT += -n32
|
||||||
|
else
|
||||||
|
FCOMMON_OPT += -n64
|
||||||
|
endif
|
||||||
|
ifeq ($(CORE), LOONGSON3A)
|
||||||
|
FCOMMON_OPT += -loongson3 -static
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), LOONGSON3B)
|
||||||
|
FCOMMON_OPT += -loongson3 -static
|
||||||
|
endif
|
||||||
|
|
||||||
|
else
|
||||||
ifndef BINARY64
|
ifndef BINARY64
|
||||||
FCOMMON_OPT += -m32
|
FCOMMON_OPT += -m32
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -m64
|
FCOMMON_OPT += -m64
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef USE_OPENMP
|
ifdef USE_OPENMP
|
||||||
FEXTRALIB += -lstdc++
|
FEXTRALIB += -lstdc++
|
||||||
|
@ -527,12 +603,30 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), OPEN64)
|
ifeq ($(C_COMPILER), OPEN64)
|
||||||
|
|
||||||
|
ifeq ($(ARCH), mips64)
|
||||||
|
ifndef BINARY64
|
||||||
|
CCOMMON_OPT += -n32
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -n64
|
||||||
|
endif
|
||||||
|
ifeq ($(CORE), LOONGSON3A)
|
||||||
|
CCOMMON_OPT += -loongson3 -static
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), LOONGSON3B)
|
||||||
|
CCOMMON_OPT += -loongson3 -static
|
||||||
|
endif
|
||||||
|
|
||||||
|
else
|
||||||
|
|
||||||
ifndef BINARY64
|
ifndef BINARY64
|
||||||
CCOMMON_OPT += -m32
|
CCOMMON_OPT += -m32
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -m64
|
CCOMMON_OPT += -m64
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), SUN)
|
ifeq ($(C_COMPILER), SUN)
|
||||||
CCOMMON_OPT += -w
|
CCOMMON_OPT += -w
|
||||||
|
@ -741,6 +835,15 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||||
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
|
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
|
||||||
#MAKEOVERRIDES =
|
#MAKEOVERRIDES =
|
||||||
|
|
||||||
|
LAPACK_CFLAGS = $(CFLAGS)
|
||||||
|
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||||
|
ifdef INTERFACE64
|
||||||
|
LAPACK_CFLAGS += -DLAPACK_ILP64
|
||||||
|
endif
|
||||||
|
ifeq ($(C_COMPILER), LSB)
|
||||||
|
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef SUFFIX
|
ifndef SUFFIX
|
||||||
SUFFIX = o
|
SUFFIX = o
|
||||||
endif
|
endif
|
||||||
|
@ -835,6 +938,13 @@ export ZGEMM_UNROLL_M
|
||||||
export ZGEMM_UNROLL_N
|
export ZGEMM_UNROLL_N
|
||||||
export XGEMM_UNROLL_M
|
export XGEMM_UNROLL_M
|
||||||
export XGEMM_UNROLL_N
|
export XGEMM_UNROLL_N
|
||||||
|
export CGEMM3M_UNROLL_M
|
||||||
|
export CGEMM3M_UNROLL_N
|
||||||
|
export ZGEMM3M_UNROLL_M
|
||||||
|
export ZGEMM3M_UNROLL_N
|
||||||
|
export XGEMM3M_UNROLL_M
|
||||||
|
export XGEMM3M_UNROLL_N
|
||||||
|
|
||||||
|
|
||||||
ifdef USE_CUDA
|
ifdef USE_CUDA
|
||||||
export CUDADIR
|
export CUDADIR
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
# COMPILER_PREFIX = mingw32-
|
# COMPILER_PREFIX = mingw32-
|
||||||
|
|
||||||
ifeq ($(OSNAME), Linux)
|
|
||||||
LDFLAGS = -melf_i386
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
ARFLAGS = -m x86
|
ARFLAGS = -m x86
|
||||||
|
|
|
@ -2,25 +2,12 @@
|
||||||
|
|
||||||
ifeq ($(OSNAME), SunOS)
|
ifeq ($(OSNAME), SunOS)
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
LDFLAGS = -64
|
|
||||||
ifeq ($(F_COMPILER), SUN)
|
ifeq ($(F_COMPILER), SUN)
|
||||||
FCOMMON_OPT += -m64
|
FCOMMON_OPT += -m64
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), FreeBSD)
|
|
||||||
LDFLAGS = -m elf_x86_64_fbsd
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Linux)
|
|
||||||
LDFLAGS = -m elf_x86_64
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Darwin)
|
|
||||||
LDFLAGS =
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
ARFLAGS = -m x64
|
ARFLAGS = -m x64
|
||||||
endif
|
endif
|
||||||
|
|
43
README.md
43
README.md
|
@ -1,11 +1,20 @@
|
||||||
# OpenBLAS
|
# OpenBLAS
|
||||||
|
|
||||||
|
[Build Status](https://travis-ci.org/xianyi/OpenBLAS)
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
|
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||||
|
|
||||||
## Installation
|
## Binary Packages
|
||||||
|
We provide binary packages for the following platforms.
|
||||||
|
|
||||||
|
* Windows x86/x86_64
|
||||||
|
|
||||||
|
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||||
|
|
||||||
|
## Installation from Source
|
||||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
||||||
|
|
||||||
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
||||||
|
@ -23,11 +32,15 @@ On X86 box, compile this library for loongson3a CPU.
|
||||||
|
|
||||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||||
|
|
||||||
|
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
|
||||||
|
|
||||||
|
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||||
|
|
||||||
### Debug version
|
### Debug version
|
||||||
|
|
||||||
make DEBUG=1
|
make DEBUG=1
|
||||||
|
|
||||||
### Intall to the directory (Optional)
|
### Install to the directory (optional)
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
@ -43,8 +56,10 @@ Please read GotoBLAS_01Readme.txt
|
||||||
#### x86/x86-64:
|
#### x86/x86-64:
|
||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
||||||
|
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
|
||||||
|
- **AMD PILEDRIVER**: Used Bulldozer codes.
|
||||||
|
|
||||||
#### MIPS64:
|
#### MIPS64:
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
@ -54,7 +69,7 @@ Please read GotoBLAS_01Readme.txt
|
||||||
- **GNU/Linux**
|
- **GNU/Linux**
|
||||||
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||||
- **FreeBSD**: Supportted by community. We didn't test the library on this OS.
|
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||||
|
|
||||||
## Usages
|
## Usages
|
||||||
Link with libopenblas.a or -lopenblas for shared library.
|
Link with libopenblas.a or -lopenblas for shared library.
|
||||||
|
@ -79,7 +94,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro
|
||||||
|
|
||||||
### Set the number of threads on runtime.
|
### Set the number of threads on runtime.
|
||||||
|
|
||||||
We provided the below functions to controll the number of threads on runtime.
|
We provided the below functions to control the number of threads on runtime.
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads);
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
@ -91,7 +106,8 @@ If you compile this lib with USE_OPENMP=1, you should use the above functions, t
|
||||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
||||||
|
|
||||||
## Contact
|
## Contact
|
||||||
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
|
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||||
|
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||||
|
|
||||||
## ChangeLog
|
## ChangeLog
|
||||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
||||||
|
@ -104,10 +120,9 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
|
||||||
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
|
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
|
||||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
||||||
|
|
||||||
## Specification of Git Branches
|
## Contributing
|
||||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
|
||||||
Now, there are 4 branches in github.com.
|
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||||
* The master branch. This a main branch to reflect a production-ready state.
|
1. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||||
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
|
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||||
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
|
|
||||||
* The gh-pages branch. This is for web pages
|
|
||||||
|
|
|
@ -8,8 +8,8 @@ Supported List:
|
||||||
1.X86/X86_64
|
1.X86/X86_64
|
||||||
a)Intel CPU:
|
a)Intel CPU:
|
||||||
P2
|
P2
|
||||||
COPPERMINE
|
|
||||||
KATMAI
|
KATMAI
|
||||||
|
COPPERMINE
|
||||||
NORTHWOOD
|
NORTHWOOD
|
||||||
PRESCOTT
|
PRESCOTT
|
||||||
BANIAS
|
BANIAS
|
||||||
|
|
18
c_check
18
c_check
|
@ -33,6 +33,8 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
|
||||||
}
|
}
|
||||||
|
|
||||||
$compiler = "";
|
$compiler = "";
|
||||||
|
$compiler = LSB if ($data =~ /COMPILER_LSB/);
|
||||||
|
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
|
||||||
$compiler = PGI if ($data =~ /COMPILER_PGI/);
|
$compiler = PGI if ($data =~ /COMPILER_PGI/);
|
||||||
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
|
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
|
||||||
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
|
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
|
||||||
|
@ -117,7 +119,11 @@ if ($compiler eq "OPEN64") {
|
||||||
$openmp = "-mp";
|
$openmp = "-mp";
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($compiler eq "GCC") {
|
if ($compiler eq "CLANG") {
|
||||||
|
$openmp = "-fopenmp";
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($compiler eq "GCC" || $compiler eq "LSB") {
|
||||||
$openmp = "-fopenmp";
|
$openmp = "-fopenmp";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -241,13 +247,13 @@ print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||||
|
|
||||||
if ($os eq "LINUX") {
|
if ($os eq "LINUX") {
|
||||||
|
|
||||||
@pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
|
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
|
||||||
|
|
||||||
if ($pthread[2] ne "") {
|
# if ($pthread[2] ne "") {
|
||||||
print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
|
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
|
||||||
} else {
|
# } else {
|
||||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||||
}
|
# }
|
||||||
} else {
|
} else {
|
||||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||||
}
|
}
|
||||||
|
|
10
cblas.h
10
cblas.h
|
@ -16,6 +16,16 @@ void goto_set_num_threads(int num_threads);
|
||||||
/*Get the build configure on runtime.*/
|
/*Get the build configure on runtime.*/
|
||||||
char* openblas_get_config(void);
|
char* openblas_get_config(void);
|
||||||
|
|
||||||
|
/* Get the parallelization type which is used by OpenBLAS */
|
||||||
|
int openblas_get_parallel(void);
|
||||||
|
/* OpenBLAS is compiled for sequential use */
|
||||||
|
#define OPENBLAS_SEQUENTIAL 0
|
||||||
|
/* OpenBLAS is compiled using normal threading model */
|
||||||
|
#define OPENBLAS_THREAD 1
|
||||||
|
/* OpenBLAS is compiled using OpenMP threading model */
|
||||||
|
#define OPENBLAS_OPENMP 2
|
||||||
|
|
||||||
|
|
||||||
#define CBLAS_INDEX size_t
|
#define CBLAS_INDEX size_t
|
||||||
|
|
||||||
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
|
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
|
||||||
|
|
17
common.h
17
common.h
|
@ -314,6 +314,23 @@ typedef int blasint;
|
||||||
#define YIELDING sched_yield()
|
#define YIELDING sched_yield()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/***
|
||||||
|
To alloc job_t on heap or stack.
|
||||||
|
Please see https://github.com/xianyi/OpenBLAS/issues/246
|
||||||
|
***/
|
||||||
|
#if defined(OS_WINDOWS)
|
||||||
|
#define GETRF_MEM_ALLOC_THRESHOLD 32
|
||||||
|
#define BLAS3_MEM_ALLOC_THRESHOLD 32
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef GETRF_MEM_ALLOC_THRESHOLD
|
||||||
|
#define GETRF_MEM_ALLOC_THRESHOLD 80
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
#define BLAS3_MEM_ALLOC_THRESHOLD 160
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef QUAD_PRECISION
|
#ifdef QUAD_PRECISION
|
||||||
#include "common_quad.h"
|
#include "common_quad.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -150,9 +150,17 @@ REALNAME:
|
||||||
#define PROFCODE .prologue 0
|
#define PROFCODE .prologue 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
|
#define GNUSTACK .section .note.GNU-stack,"",%progbits
|
||||||
|
#else
|
||||||
|
#define GNUSTACK
|
||||||
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
.end REALNAME; \
|
.end REALNAME; \
|
||||||
.ident VERSION
|
.ident VERSION; \
|
||||||
|
GNUSTACK
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
|
|
|
@ -379,8 +379,15 @@ REALNAME:
|
||||||
#define PROFCODE
|
#define PROFCODE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
|
#define GNUSTACK .section .note.GNU-stack,"",%progbits
|
||||||
|
#else
|
||||||
|
#define GNUSTACK
|
||||||
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
.endp REALNAME
|
.endp REALNAME ; \
|
||||||
|
GNUSTACK
|
||||||
|
|
||||||
#define START_ADDRESS 0x20000fc800000000UL
|
#define START_ADDRESS 0x20000fc800000000UL
|
||||||
|
|
||||||
|
|
|
@ -65,9 +65,16 @@ extern long int syscall (long int __sysno, ...);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||||
unsigned long *nodemask, unsigned long maxnode,
|
unsigned long *nodemask, unsigned long maxnode,
|
||||||
unsigned flags) {
|
unsigned flags) {
|
||||||
|
#if defined (__LSB_VERSION__)
|
||||||
|
// So far, LSB (Linux Standard Base) doesn't support syscall().
|
||||||
|
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
#if defined (LOONGSON3B)
|
#if defined (LOONGSON3B)
|
||||||
#if defined (__64BIT__)
|
#if defined (__64BIT__)
|
||||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||||
|
@ -79,11 +86,17 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||||
// unsigned long null_nodemask=0;
|
// unsigned long null_nodemask=0;
|
||||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||||
|
#if defined (__LSB_VERSION__)
|
||||||
|
// So far, LSB (Linux Standard Base) doesn't support syscall().
|
||||||
|
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
return syscall(SYS_set_mempolicy, mode, addr, flag);
|
return syscall(SYS_set_mempolicy, mode, addr, flag);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int my_gettid(void) {
|
static inline int my_gettid(void) {
|
||||||
|
|
|
@ -235,10 +235,17 @@ REALNAME: ;\
|
||||||
.set noreorder ;\
|
.set noreorder ;\
|
||||||
.set nomacro
|
.set nomacro
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
|
#define GNUSTACK .section .note.GNU-stack,"",%progbits
|
||||||
|
#else
|
||||||
|
#define GNUSTACK
|
||||||
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
.set macro ;\
|
.set macro ;\
|
||||||
.set reorder ;\
|
.set reorder ;\
|
||||||
.end REALNAME
|
.end REALNAME ;\
|
||||||
|
GNUSTACK
|
||||||
|
|
||||||
#define PROFCODE
|
#define PROFCODE
|
||||||
#endif
|
#endif
|
||||||
|
@ -255,8 +262,8 @@ REALNAME: ;\
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LOONGSON3B)
|
#if defined(LOONGSON3B)
|
||||||
#define PAGESIZE (32UL << 10)
|
#define PAGESIZE (16UL << 10)
|
||||||
#define FIXED_PAGESIZE (32UL << 10)
|
#define FIXED_PAGESIZE (16UL << 10)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef PAGESIZE
|
#ifndef PAGESIZE
|
||||||
|
|
|
@ -199,8 +199,17 @@ static __inline int blas_quickdivide(blasint x, blasint y){
|
||||||
.type REALNAME, #function; \
|
.type REALNAME, #function; \
|
||||||
.proc 07; \
|
.proc 07; \
|
||||||
REALNAME:;
|
REALNAME:;
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
|
#define GNUSTACK .section .note.GNU-stack,"",%progbits
|
||||||
|
#else
|
||||||
|
#define GNUSTACK
|
||||||
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
.size REALNAME, .-REALNAME
|
.size REALNAME, .-REALNAME; \
|
||||||
|
GNUSTACK
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
13
common_x86.h
13
common_x86.h
|
@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#define MMXSTORE movd
|
#define MMXSTORE movd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||||
|
//Enable some optimization for barcelona.
|
||||||
|
#define BARCELONA_OPTIMIZATION
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_3DNOW)
|
#if defined(HAVE_3DNOW)
|
||||||
#define EMMS femms
|
#define EMMS femms
|
||||||
#elif defined(HAVE_MMX)
|
#elif defined(HAVE_MMX)
|
||||||
|
@ -296,7 +301,9 @@ REALNAME:
|
||||||
#define PROFCODE
|
#define PROFCODE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE .size REALNAME, .-REALNAME
|
#define EPILOGUE \
|
||||||
|
.size REALNAME, .-REALNAME; \
|
||||||
|
.section .note.GNU-stack,"",%progbits
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -335,6 +342,7 @@ REALNAME:
|
||||||
#define ALIGN_2 .align 2
|
#define ALIGN_2 .align 2
|
||||||
#define ALIGN_3 .align 3
|
#define ALIGN_3 .align 3
|
||||||
#define ALIGN_4 .align 4
|
#define ALIGN_4 .align 4
|
||||||
|
#define ALIGN_5 .align 5
|
||||||
#define ffreep fstp
|
#define ffreep fstp
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -356,11 +364,10 @@ REALNAME:
|
||||||
|
|
||||||
#ifndef ALIGN_6
|
#ifndef ALIGN_6
|
||||||
#define ALIGN_6 .align 64
|
#define ALIGN_6 .align 64
|
||||||
|
#endif
|
||||||
// ffreep %st(0).
|
// ffreep %st(0).
|
||||||
// Because Clang didn't support ffreep, we directly use the opcode.
|
// Because Clang didn't support ffreep, we directly use the opcode.
|
||||||
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
|
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
|
||||||
#ifndef ffreep
|
#ifndef ffreep
|
||||||
#define ffreep .byte 0xdf, 0xc0 #
|
#define ffreep .byte 0xdf, 0xc0 #
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
|
|
@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
#ifdef ASSEMBLER
|
#ifdef ASSEMBLER
|
||||||
|
|
||||||
|
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||||
|
//Enable some optimization for barcelona.
|
||||||
|
#define BARCELONA_OPTIMIZATION
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_3DNOW)
|
#if defined(HAVE_3DNOW)
|
||||||
#define EMMS femms
|
#define EMMS femms
|
||||||
#elif defined(HAVE_MMX)
|
#elif defined(HAVE_MMX)
|
||||||
|
@ -367,7 +372,10 @@ REALNAME:
|
||||||
#define PROFCODE
|
#define PROFCODE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define EPILOGUE .size REALNAME, .-REALNAME
|
#define EPILOGUE \
|
||||||
|
.size REALNAME, .-REALNAME; \
|
||||||
|
.section .note.GNU-stack,"",%progbits
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
7
cpuid.h
7
cpuid.h
|
@ -106,6 +106,8 @@
|
||||||
#define CORE_SANDYBRIDGE 20
|
#define CORE_SANDYBRIDGE 20
|
||||||
#define CORE_BOBCAT 21
|
#define CORE_BOBCAT 21
|
||||||
#define CORE_BULLDOZER 22
|
#define CORE_BULLDOZER 22
|
||||||
|
#define CORE_PILEDRIVER 23
|
||||||
|
#define CORE_HASWELL CORE_SANDYBRIDGE
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
#define HAVE_SSE2 (1 << 1)
|
#define HAVE_SSE2 (1 << 1)
|
||||||
|
@ -127,6 +129,7 @@
|
||||||
#define HAVE_FASTMOVU (1 << 17)
|
#define HAVE_FASTMOVU (1 << 17)
|
||||||
#define HAVE_AVX (1 << 18)
|
#define HAVE_AVX (1 << 18)
|
||||||
#define HAVE_FMA4 (1 << 19)
|
#define HAVE_FMA4 (1 << 19)
|
||||||
|
#define HAVE_FMA3 (1 << 20)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
@ -196,4 +199,8 @@ typedef struct {
|
||||||
#define CPUTYPE_SANDYBRIDGE 44
|
#define CPUTYPE_SANDYBRIDGE 44
|
||||||
#define CPUTYPE_BOBCAT 45
|
#define CPUTYPE_BOBCAT 45
|
||||||
#define CPUTYPE_BULLDOZER 46
|
#define CPUTYPE_BULLDOZER 46
|
||||||
|
#define CPUTYPE_PILEDRIVER 47
|
||||||
|
// this define is because BLAS doesn't have haswell specific optimizations yet
|
||||||
|
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -114,6 +114,7 @@ int detect(void){
|
||||||
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
||||||
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||||
|
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||||
|
|
||||||
|
|
63
cpuid_x86.c
63
cpuid_x86.c
|
@ -41,10 +41,14 @@
|
||||||
#include "cpuid.h"
|
#include "cpuid.h"
|
||||||
|
|
||||||
#ifdef NO_AVX
|
#ifdef NO_AVX
|
||||||
|
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||||
|
#define CORE_HASWELL CORE_NEHALEM
|
||||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||||
#define CORE_BULLDOZER CORE_BARCELONA
|
#define CORE_BULLDOZER CORE_BARCELONA
|
||||||
|
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
|
||||||
|
#define CORE_PILEDRIVER CORE_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPUIDEMU
|
#ifndef CPUIDEMU
|
||||||
|
@ -130,7 +134,7 @@ int support_avx(){
|
||||||
int ret=0;
|
int ret=0;
|
||||||
|
|
||||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||||
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
|
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
|
||||||
xgetbv(0, &eax, &edx);
|
xgetbv(0, &eax, &edx);
|
||||||
if((eax & 6) == 6){
|
if((eax & 6) == 6){
|
||||||
ret=1; //OS support AVX
|
ret=1; //OS support AVX
|
||||||
|
@ -225,6 +229,7 @@ int get_cputype(int gettype){
|
||||||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
if (support_avx()) feature |= HAVE_AVX;
|
if (support_avx()) feature |= HAVE_AVX;
|
||||||
|
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (have_excpuid() >= 0x01) {
|
if (have_excpuid() >= 0x01) {
|
||||||
|
@ -1050,6 +1055,20 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
|
case 12:
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
switch (model) {
|
||||||
|
case 5:
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1084,11 +1103,21 @@ int get_cpuname(void){
|
||||||
case 1:
|
case 1:
|
||||||
case 10:
|
case 10:
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
case 6:
|
||||||
|
switch (model) {
|
||||||
|
case 1:
|
||||||
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return CPUTYPE_BULLDOZER;
|
return CPUTYPE_BULLDOZER;
|
||||||
else
|
else
|
||||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
case 2:
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_PILEDRIVER;
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
return CPUTYPE_BOBCAT;
|
return CPUTYPE_BOBCAT;
|
||||||
}
|
}
|
||||||
|
@ -1213,6 +1242,7 @@ static char *cpuname[] = {
|
||||||
"SANDYBRIDGE",
|
"SANDYBRIDGE",
|
||||||
"BOBCAT",
|
"BOBCAT",
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
|
"PILEDRIVER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1262,6 +1292,7 @@ static char *lowercpuname[] = {
|
||||||
"sandybridge",
|
"sandybridge",
|
||||||
"bobcat",
|
"bobcat",
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
|
"piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1288,6 +1319,7 @@ static char *corename[] = {
|
||||||
"SANDYBRIDGE",
|
"SANDYBRIDGE",
|
||||||
"BOBCAT",
|
"BOBCAT",
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
|
"PILEDRIVER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1314,6 +1346,7 @@ static char *corename_lower[] = {
|
||||||
"sandybridge",
|
"sandybridge",
|
||||||
"bobcat",
|
"bobcat",
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
|
"piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1424,6 +1457,20 @@ int get_coretype(void){
|
||||||
return CORE_SANDYBRIDGE;
|
return CORE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CORE_NEHALEM; //OS doesn't support AVX
|
return CORE_NEHALEM; //OS doesn't support AVX
|
||||||
|
case 12:
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_HASWELL;
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
switch (model) {
|
||||||
|
case 5:
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_HASWELL;
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1442,11 +1489,19 @@ int get_coretype(void){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||||
else if (exfamily == 5) return CORE_BOBCAT;
|
else if (exfamily == 5) return CORE_BOBCAT;
|
||||||
else if (exfamily == 6) {
|
else if (exfamily == 6) {
|
||||||
|
switch (model) {
|
||||||
|
case 1:
|
||||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return CORE_BULLDOZER;
|
return CORE_BULLDOZER;
|
||||||
else
|
else
|
||||||
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
|
case 2:
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_PILEDRIVER;
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
|
}
|
||||||
}else return CORE_BARCELONA;
|
}else return CORE_BARCELONA;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1534,6 +1589,7 @@ void get_cpuconfig(void){
|
||||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||||
|
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
|
||||||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
||||||
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
||||||
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
||||||
|
@ -1601,5 +1657,6 @@ void get_sse(void){
|
||||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||||
|
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
14
ctest.c
14
ctest.c
|
@ -1,3 +1,17 @@
|
||||||
|
//LSB (Linux Standard Base) compiler
|
||||||
|
//only lsbcc (the C compiler) is supported; lsbc++ is rejected by the #error below
|
||||||
|
#if defined (__LSB_VERSION__)
|
||||||
|
#if !defined (__cplusplus)
|
||||||
|
COMPILER_LSB
|
||||||
|
#else
|
||||||
|
#error "OpenBLAS only supports lsbcc."
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__clang__)
|
||||||
|
COMPILER_CLANG
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__PGI) || defined(__PGIC__)
|
#if defined(__PGI) || defined(__PGIC__)
|
||||||
COMPILER_PGI
|
COMPILER_PGI
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -77,7 +77,7 @@ endif
|
||||||
clean ::
|
clean ::
|
||||||
rm -f x*
|
rm -f x*
|
||||||
|
|
||||||
FLDFLAGS = $(FFLAGS:-fPIC=)
|
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
||||||
CEXTRALIB =
|
CEXTRALIB =
|
||||||
|
|
||||||
# Single real
|
# Single real
|
||||||
|
|
|
@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
|
|
||||||
a = (FLOAT *)args -> a;
|
a = (FLOAT *)args -> a;
|
||||||
x = (FLOAT *)args -> b;
|
x = (FLOAT *)args -> b;
|
||||||
y = (FLOAT *)args -> c;
|
|
||||||
|
|
||||||
lda = args -> lda;
|
lda = args -> lda;
|
||||||
incx = args -> ldb;
|
incx = args -> ldb;
|
||||||
|
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
n_from = 0;
|
n_from = 0;
|
||||||
n_to = n;
|
n_to = n;
|
||||||
|
|
||||||
|
//Use y as each thread's n* COMPSIZE elements in sb buffer
|
||||||
|
y = buffer;
|
||||||
|
buffer += ((COMPSIZE * n + 1023) & ~1023);
|
||||||
|
|
||||||
if (range_m) {
|
if (range_m) {
|
||||||
n_from = *(range_m + 0);
|
n_from = *(range_m + 0);
|
||||||
n_to = *(range_m + 1);
|
n_to = *(range_m + 1);
|
||||||
|
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
a += n_from * lda * COMPSIZE;
|
a += n_from * lda * COMPSIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (range_n) y += *range_n * COMPSIZE;
|
|
||||||
|
|
||||||
if (incx != 1) {
|
if (incx != 1) {
|
||||||
COPY_K(n, x, incx, buffer, 1);
|
COPY_K(n, x, incx, buffer, 1);
|
||||||
|
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
queue[0].sa = NULL;
|
queue[0].sa = NULL;
|
||||||
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
|
queue[0].sb = buffer;
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
exec_blas(num_cpu, queue);
|
exec_blas(num_cpu, queue);
|
||||||
|
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
#else
|
#else
|
||||||
ONE, ZERO,
|
ONE, ZERO,
|
||||||
#endif
|
#endif
|
||||||
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
|
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
AXPYU_K(n, 0, 0,
|
AXPYU_K(n, 0, 0,
|
||||||
|
|
|
@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
queue[num_cpu].args = arg;
|
queue[num_cpu].args = arg;
|
||||||
queue[num_cpu].range_m = range_m;
|
queue[num_cpu].range_m = range_m;
|
||||||
queue[num_cpu].range_n = &range[num_cpu];
|
queue[num_cpu].range_n = &range[num_cpu];
|
||||||
#if defined(LOONGSON3A)
|
#if 0 //defined(LOONGSON3A)
|
||||||
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
||||||
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
||||||
#else
|
#else
|
||||||
|
@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
}
|
}
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
#if defined(LOONGSON3A)
|
#if 0 //defined(LOONGSON3A)
|
||||||
queue[0].sa = sa;
|
queue[0].sa = sa;
|
||||||
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
#else
|
#else
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
|
||||||
|
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
||||||
|
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
|
|
||||||
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,12 @@
|
||||||
#define SWITCH_RATIO 2
|
#define SWITCH_RATIO 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//The array of job_t may overflow the stack.
|
||||||
|
//Instead, use malloc to alloc job_t.
|
||||||
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
#define USE_ALLOC_HEAP
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef GEMM3M_LOCAL
|
#ifndef GEMM3M_LOCAL
|
||||||
#if defined(NN)
|
#if defined(NN)
|
||||||
#define GEMM3M_LOCAL GEMM3M_NN
|
#define GEMM3M_LOCAL GEMM3M_NN
|
||||||
|
@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
||||||
BLASLONG range_N[MAX_CPU_NUMBER + 1];
|
BLASLONG range_N[MAX_CPU_NUMBER + 1];
|
||||||
|
|
||||||
|
#ifndef USE_ALLOC_HEAP
|
||||||
job_t job[MAX_CPU_NUMBER];
|
job_t job[MAX_CPU_NUMBER];
|
||||||
|
#else
|
||||||
|
job_t * job = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
BLASLONG num_cpu_m, num_cpu_n;
|
BLASLONG num_cpu_m, num_cpu_n;
|
||||||
|
|
||||||
|
@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
newarg.alpha = args -> alpha;
|
newarg.alpha = args -> alpha;
|
||||||
newarg.beta = args -> beta;
|
newarg.beta = args -> beta;
|
||||||
newarg.nthreads = args -> nthreads;
|
newarg.nthreads = args -> nthreads;
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||||
|
if(job==NULL){
|
||||||
|
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
newarg.common = (void *)job;
|
newarg.common = (void *)job;
|
||||||
|
|
||||||
if (!range_m) {
|
if (!range_m) {
|
||||||
|
@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
exec_blas(num_cpu_m, queue);
|
exec_blas(num_cpu_m, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
free(job);
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,12 @@
|
||||||
#define SWITCH_RATIO 2
|
#define SWITCH_RATIO 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//The array of job_t may overflow the stack.
|
||||||
|
//Instead, use malloc to alloc job_t.
|
||||||
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
#define USE_ALLOC_HEAP
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef SYRK_LOCAL
|
#ifndef SYRK_LOCAL
|
||||||
#if !defined(LOWER) && !defined(TRANS)
|
#if !defined(LOWER) && !defined(TRANS)
|
||||||
#define SYRK_LOCAL SYRK_UN
|
#define SYRK_LOCAL SYRK_UN
|
||||||
|
@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
blas_arg_t newarg;
|
blas_arg_t newarg;
|
||||||
|
|
||||||
|
#ifndef USE_ALLOC_HEAP
|
||||||
job_t job[MAX_CPU_NUMBER];
|
job_t job[MAX_CPU_NUMBER];
|
||||||
|
#else
|
||||||
|
job_t * job = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
BLASLONG range[MAX_CPU_NUMBER + 100];
|
BLASLONG range[MAX_CPU_NUMBER + 100];
|
||||||
|
@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
newarg.ldc = args -> ldc;
|
newarg.ldc = args -> ldc;
|
||||||
newarg.alpha = args -> alpha;
|
newarg.alpha = args -> alpha;
|
||||||
newarg.beta = args -> beta;
|
newarg.beta = args -> beta;
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||||
|
if(job==NULL){
|
||||||
|
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
newarg.common = (void *)job;
|
newarg.common = (void *)job;
|
||||||
|
|
||||||
if (!range_n) {
|
if (!range_n) {
|
||||||
|
@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
exec_blas(num_cpu, queue);
|
exec_blas(num_cpu, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
free(job);
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,6 +48,12 @@
|
||||||
#define SWITCH_RATIO 2
|
#define SWITCH_RATIO 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
//The array of job_t may overflow the stack.
|
||||||
|
//Instead, use malloc to alloc job_t.
|
||||||
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
#define USE_ALLOC_HEAP
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef GEMM_LOCAL
|
#ifndef GEMM_LOCAL
|
||||||
#if defined(NN)
|
#if defined(NN)
|
||||||
#define GEMM_LOCAL GEMM_NN
|
#define GEMM_LOCAL GEMM_NN
|
||||||
|
@ -360,7 +366,19 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
|
|
||||||
|
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
|
||||||
|
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
|
|
||||||
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -519,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
|
|
||||||
blas_arg_t newarg;
|
blas_arg_t newarg;
|
||||||
|
|
||||||
|
#ifndef USE_ALLOC_HEAP
|
||||||
job_t job[MAX_CPU_NUMBER];
|
job_t job[MAX_CPU_NUMBER];
|
||||||
|
#else
|
||||||
|
job_t * job = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
BLASLONG range_M[MAX_CPU_NUMBER + 1];
|
||||||
|
@ -563,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
newarg.alpha = args -> alpha;
|
newarg.alpha = args -> alpha;
|
||||||
newarg.beta = args -> beta;
|
newarg.beta = args -> beta;
|
||||||
newarg.nthreads = args -> nthreads;
|
newarg.nthreads = args -> nthreads;
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||||
|
if(job==NULL){
|
||||||
|
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
newarg.common = (void *)job;
|
newarg.common = (void *)job;
|
||||||
|
|
||||||
#ifdef PARAMTEST
|
#ifdef PARAMTEST
|
||||||
|
@ -648,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
exec_blas(num_cpu_m, queue);
|
exec_blas(num_cpu_m, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
free(job);
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
TOPDIR = ../..
|
TOPDIR = ../..
|
||||||
include ../../Makefile.system
|
include ../../Makefile.system
|
||||||
|
|
||||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
|
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX)
|
||||||
|
|
||||||
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||||
|
|
||||||
|
@ -106,6 +106,9 @@ openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
||||||
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
|
@ -49,8 +49,12 @@
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
|
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||||
|
@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
|
//adjust buffer for each thread
|
||||||
|
for(i=0; i<blas_cpu_number; i++){
|
||||||
|
if(blas_thread_buffer[i]==NULL){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
#if defined(ARCH_MIPS64)
|
#if defined(ARCH_MIPS64)
|
||||||
//set parameters for different number of threads.
|
//set parameters for different number of threads.
|
||||||
blas_set_parameter();
|
blas_set_parameter();
|
||||||
|
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
|
|
||||||
|
for(i=0; i<blas_num_threads; i++){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BLASFUNC(blas_thread_shutdown)(void){
|
int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
|
int i=0;
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
||||||
|
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
static void exec_threads(blas_queue_t *queue){
|
static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
void *buffer, *sa, *sb;
|
void *buffer, *sa, *sb;
|
||||||
|
int pos=0, release_flag=0;
|
||||||
|
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
sa = queue -> sa;
|
sa = queue -> sa;
|
||||||
|
@ -189,9 +222,19 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||||
|
|
||||||
buffer = blas_memory_alloc(2);
|
pos = omp_get_thread_num();
|
||||||
|
buffer = blas_thread_buffer[pos];
|
||||||
|
|
||||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
//fallback
|
||||||
|
if(buffer==NULL) {
|
||||||
|
buffer = blas_memory_alloc(2);
|
||||||
|
release_flag=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sa == NULL) {
|
||||||
|
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||||
|
queue->sa=sa;
|
||||||
|
}
|
||||||
|
|
||||||
if (sb == NULL) {
|
if (sb == NULL) {
|
||||||
if (!(queue -> mode & BLAS_COMPLEX)){
|
if (!(queue -> mode & BLAS_COMPLEX)){
|
||||||
|
@ -224,6 +267,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -241,7 +285,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (buffer != NULL) blas_memory_free(buffer);
|
if (release_flag) blas_memory_free(buffer);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
|
@ -64,12 +64,15 @@ extern gotoblas_t gotoblas_BOBCAT;
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
|
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||||
#else
|
#else
|
||||||
//Use NEHALEM kernels for sandy bridge
|
//Use NEHALEM kernels for sandy bridge
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
//Use sandy bridge kernels for haswell.
|
||||||
|
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||||
|
|
||||||
#define VENDOR_INTEL 1
|
#define VENDOR_INTEL 1
|
||||||
#define VENDOR_AMD 2
|
#define VENDOR_AMD 2
|
||||||
|
@ -92,7 +95,7 @@ int support_avx(){
|
||||||
int ret=0;
|
int ret=0;
|
||||||
|
|
||||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||||
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){
|
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
|
||||||
xgetbv(0, &eax, &edx);
|
xgetbv(0, &eax, &edx);
|
||||||
if((eax & 6) == 6){
|
if((eax & 6) == 6){
|
||||||
ret=1; //OS support AVX
|
ret=1; //OS support AVX
|
||||||
|
@ -175,7 +178,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_SANDYBRIDGE;
|
return &gotoblas_SANDYBRIDGE;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -186,7 +189,27 @@ static gotoblas_t *get_coretype(void){
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_SANDYBRIDGE;
|
return &gotoblas_SANDYBRIDGE;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//Intel Haswell
|
||||||
|
if (model == 12) {
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
else{
|
||||||
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
case 4:
|
||||||
|
//Intel Haswell
|
||||||
|
if (model == 5) {
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
else{
|
||||||
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -207,13 +230,23 @@ static gotoblas_t *get_coretype(void){
|
||||||
} else if (exfamily == 5) {
|
} else if (exfamily == 5) {
|
||||||
return &gotoblas_BOBCAT;
|
return &gotoblas_BOBCAT;
|
||||||
} else if (exfamily == 6) {
|
} else if (exfamily == 6) {
|
||||||
|
if(model == 1){
|
||||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_BULLDOZER;
|
return &gotoblas_BULLDOZER;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
}else if(model == 2){
|
||||||
|
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_PILEDRIVER;
|
||||||
|
else{
|
||||||
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||||
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return &gotoblas_BARCELONA;
|
return &gotoblas_BARCELONA;
|
||||||
}
|
}
|
||||||
|
@ -251,6 +284,7 @@ static char *corename[] = {
|
||||||
"Sandybridge",
|
"Sandybridge",
|
||||||
"Bobcat",
|
"Bobcat",
|
||||||
"Bulldozer",
|
"Bulldozer",
|
||||||
|
"Piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -273,6 +307,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||||
|
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||||
|
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,6 +82,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#include <dirent.h>
|
#include <dirent.h>
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
#define MAX_NODES 16
|
#define MAX_NODES 16
|
||||||
#define MAX_CPUS 256
|
#define MAX_CPUS 256
|
||||||
|
@ -314,7 +316,7 @@ static int numa_check(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
while ((dir = readdir(dp)) != NULL) {
|
while ((dir = readdir(dp)) != NULL) {
|
||||||
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
|
if (strncmp(dir->d_name, "node", 4)==0) {
|
||||||
|
|
||||||
node = atoi(&dir -> d_name[4]);
|
node = atoi(&dir -> d_name[4]);
|
||||||
|
|
||||||
|
@ -735,7 +737,8 @@ void gotoblas_affinity_init(void) {
|
||||||
fprintf(stderr, "Shared Memory Initialization.\n");
|
fprintf(stderr, "Shared Memory Initialization.\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
common -> num_procs = get_nprocs();
|
//returns the number of processors which are currently online
|
||||||
|
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;
|
||||||
|
|
||||||
if(common -> num_procs > MAX_CPUS) {
|
if(common -> num_procs > MAX_CPUS) {
|
||||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
|
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
|
||||||
|
|
|
@ -105,6 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
|
#include <sys/resource.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
|
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
|
||||||
|
@ -125,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define NO_WARMUP
|
#define NO_WARMUP
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef ALLOC_HUGETLB
|
#ifndef SHM_HUGETLB
|
||||||
#define SHM_HUGETLB 04000
|
#define SHM_HUGETLB 04000
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -216,6 +217,25 @@ int get_num_procs(void) {
|
||||||
}
|
}
|
||||||
return nums;
|
return nums;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
void set_stack_limit(int limitMB){
|
||||||
|
int result=0;
|
||||||
|
struct rlimit rl;
|
||||||
|
rlim_t StackSize;
|
||||||
|
|
||||||
|
StackSize=limitMB*1024*1024;
|
||||||
|
result=getrlimit(RLIMIT_STACK, &rl);
|
||||||
|
if(result==0){
|
||||||
|
if(rl.rlim_cur < StackSize){
|
||||||
|
rl.rlim_cur=StackSize;
|
||||||
|
result=setrlimit(RLIMIT_STACK, &rl);
|
||||||
|
if(result !=0){
|
||||||
|
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1248,6 +1268,7 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||||
|
|
||||||
if (gotoblas_initialized) return;
|
if (gotoblas_initialized) return;
|
||||||
|
|
||||||
|
|
||||||
#ifdef PROFILE
|
#ifdef PROFILE
|
||||||
moncontrol (0);
|
moncontrol (0);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Threading-model code fixed at compile time and handed back by the two
 * accessor functions in this file:
 *   2  when USE_OPENMP is defined,
 *   1  when SMP_SERVER is defined (and USE_OPENMP is not),
 *   0  otherwise.
 * NOTE(review): presumably 1 means the library's own threaded server and
 * 0 means a sequential build -- confirm against the public header. */
#if defined(USE_OPENMP)
|
||||||
|
static int parallel = 2 ;
|
||||||
|
#elif defined(SMP_SERVER)
|
||||||
|
static int parallel = 1;
|
||||||
|
#else
|
||||||
|
static int parallel = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Returns the file-scope `parallel` value chosen by the preprocessor.
 * Takes no arguments and has no side effects.
 * NOTE(review): CNAME is a macro supplied from outside this file,
 * presumably expanding to the C-facing symbol name -- confirm in the build. */
int CNAME() {
|
||||||
|
return parallel;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Second entry point with the same body as CNAME(): returns the file-scope
 * `parallel` value unchanged.
 * NOTE(review): NAME is a macro supplied from outside this file, presumably
 * the Fortran-style (underscore-suffixed) symbol -- confirm in the build. */
int NAME() {
|
||||||
|
return parallel;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -89,7 +89,7 @@ else
|
||||||
endif
|
endif
|
||||||
|
|
||||||
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
||||||
$(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \
|
$(CC) $(CFLAGS) $(LDFLAGS) libgoto2_shared.def -shared -o $(@F) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
||||||
|
|
||||||
|
@ -116,10 +116,15 @@ ifeq ($(OSNAME), Linux)
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
ifneq ($(C_COMPILER), LSB)
|
||||||
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
|
else
|
||||||
|
#Use FC on LSB
|
||||||
|
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
|
endif
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
@ -130,10 +135,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
|
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
@ -143,15 +148,15 @@ ifeq ($(OSNAME), OSF1)
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
|
|
||||||
../$(LIBSONAME) :
|
../$(LIBSONAME) :
|
||||||
$(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), SunOS)
|
ifeq ($(OSNAME), SunOS)
|
||||||
|
|
||||||
so : ../$(LIBSONAME)
|
so : ../$(LIBSONAME)
|
||||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB)
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
@ -194,7 +199,7 @@ symbol.S : gensymbol
|
||||||
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S
|
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S
|
||||||
|
|
||||||
test : linktest.c
|
test : linktest.c
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||||
|
|
|
@ -49,7 +49,7 @@
|
||||||
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
|
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
|
||||||
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
|
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
|
||||||
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
|
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
|
||||||
cblas_ztrsv);
|
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub );
|
||||||
|
|
||||||
@exblasobjs = (
|
@exblasobjs = (
|
||||||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
||||||
|
@ -72,13 +72,18 @@
|
||||||
zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
|
zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
#both underscore and no underscore
|
||||||
|
@misc_common_objs = (
|
||||||
|
openblas_set_num_threads, openblas_get_parallel,
|
||||||
|
);
|
||||||
|
|
||||||
@misc_no_underscore_objs = (
|
@misc_no_underscore_objs = (
|
||||||
openblas_set_num_threads, goto_set_num_threads,
|
goto_set_num_threads,
|
||||||
openblas_get_config,
|
openblas_get_config,
|
||||||
);
|
);
|
||||||
|
|
||||||
@misc_underscore_objs = (
|
@misc_underscore_objs = (
|
||||||
openblas_set_num_threads,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
@lapackobjs = (
|
@lapackobjs = (
|
||||||
|
@ -111,7 +116,7 @@
|
||||||
# already provided by @blasobjs: xerbla, lsame
|
# already provided by @blasobjs: xerbla, lsame
|
||||||
ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
|
ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
|
||||||
ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
|
ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
|
||||||
ilaver, slamch,
|
ilaver, slamch, slamc3,
|
||||||
|
|
||||||
# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
|
# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
|
||||||
# excluded: second_$(TIMER)
|
# excluded: second_$(TIMER)
|
||||||
|
@ -148,7 +153,7 @@
|
||||||
dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
|
dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
|
||||||
dsteqr, dsterf, dlaisnan, disnan,
|
dsteqr, dsterf, dlaisnan, disnan,
|
||||||
dlartgp, dlartgs,
|
dlartgp, dlartgs,
|
||||||
dlamch,
|
dlamch, dlamc3,
|
||||||
|
|
||||||
# SLASRC -- Single precision real LAPACK routines
|
# SLASRC -- Single precision real LAPACK routines
|
||||||
# already provided by @lapackobjs:
|
# already provided by @lapackobjs:
|
||||||
|
@ -2671,7 +2676,7 @@ if ($ARGV[5] == 1) {
|
||||||
#NO_LAPACK=1
|
#NO_LAPACK=1
|
||||||
@underscore_objs = (@blasobjs, @misc_underscore_objs);
|
@underscore_objs = (@blasobjs, @misc_underscore_objs);
|
||||||
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" ||
|
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" ||
|
||||||
-d "../lapack-3.4.2") {
|
-d "../lapack-3.4.2" || -d "../lapack-netlib") {
|
||||||
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
|
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
|
||||||
} else {
|
} else {
|
||||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||||
|
@ -2679,7 +2684,7 @@ if ($ARGV[5] == 1) {
|
||||||
|
|
||||||
if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };
|
if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };
|
||||||
|
|
||||||
if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
if ($ARGV[1] eq "x86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||||
|
|
||||||
if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
|
||||||
|
|
||||||
|
@ -2716,6 +2721,10 @@ $bu = $ARGV[2];
|
||||||
$bu = "" if (($bu eq "0") || ($bu eq "1"));
|
$bu = "" if (($bu eq "0") || ($bu eq "1"));
|
||||||
|
|
||||||
if ($ARGV[0] eq "linux"){
|
if ($ARGV[0] eq "linux"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
print $objs, $bu, "\n";
|
print $objs, $bu, "\n";
|
||||||
}
|
}
|
||||||
|
@ -2733,6 +2742,10 @@ if ($ARGV[0] eq "linux"){
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($ARGV[0] eq "osx"){
|
if ($ARGV[0] eq "osx"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
print "_", $objs, $bu, "\n";
|
print "_", $objs, $bu, "\n";
|
||||||
}
|
}
|
||||||
|
@ -2746,6 +2759,10 @@ if ($ARGV[0] eq "osx"){
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($ARGV[0] eq "aix"){
|
if ($ARGV[0] eq "aix"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
print $objs, $bu, "\n";
|
print $objs, $bu, "\n";
|
||||||
}
|
}
|
||||||
|
@ -2762,8 +2779,9 @@ if ($ARGV[0] eq "win2k"){
|
||||||
print "EXPORTS\n";
|
print "EXPORTS\n";
|
||||||
$count = 1;
|
$count = 1;
|
||||||
|
|
||||||
|
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads
|
|
||||||
$uppercase = $objs;
|
$uppercase = $objs;
|
||||||
$uppercase =~ tr/[a-z]/[A-Z]/;
|
$uppercase =~ tr/[a-z]/[A-Z]/;
|
||||||
print "\t$objs=$objs","_ \@", $count, "\n";
|
print "\t$objs=$objs","_ \@", $count, "\n";
|
||||||
|
@ -2773,11 +2791,18 @@ if ($ARGV[0] eq "win2k"){
|
||||||
print "\t$uppercase=$objs", "_ \@", $count, "\n";
|
print "\t$uppercase=$objs", "_ \@", $count, "\n";
|
||||||
$count ++;
|
$count ++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#for misc_common_objs
|
||||||
|
foreach $objs (@misc_common_objs) {
|
||||||
|
|
||||||
|
$uppercase = $objs;
|
||||||
|
$uppercase =~ tr/[a-z]/[A-Z]/;
|
||||||
|
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
|
||||||
|
$count ++;
|
||||||
|
print "\t$uppercase=$objs", "_ \@", $count, "\n";
|
||||||
|
$count ++;
|
||||||
}
|
}
|
||||||
|
|
||||||
#for openblas_set_num_threads
|
|
||||||
print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n";
|
|
||||||
$count ++;
|
|
||||||
|
|
||||||
foreach $objs (@no_underscore_objs) {
|
foreach $objs (@no_underscore_objs) {
|
||||||
print "\t",$objs,"=$objs"," \@", $count, "\n";
|
print "\t",$objs,"=$objs"," \@", $count, "\n";
|
||||||
|
@ -2810,6 +2835,9 @@ if ($ARGV[0] eq "win2khpl"){
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($ARGV[0] eq "microsoft"){
|
if ($ARGV[0] eq "microsoft"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
print "EXPORTS\n";
|
print "EXPORTS\n";
|
||||||
$count = 1;
|
$count = 1;
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
|
@ -2828,6 +2856,9 @@ if ($ARGV[0] eq "microsoft"){
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($ARGV[0] eq "win2kasm"){
|
if ($ARGV[0] eq "win2kasm"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
print "\t.text\n";
|
print "\t.text\n";
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
$uppercase = $objs;
|
$uppercase = $objs;
|
||||||
|
@ -2841,6 +2872,10 @@ if ($ARGV[0] eq "win2kasm"){
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($ARGV[0] eq "linktest"){
|
if ($ARGV[0] eq "linktest"){
|
||||||
|
|
||||||
|
@underscore_objs = (@underscore_objs, @misc_common_objs);
|
||||||
|
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
|
||||||
|
|
||||||
print "int main(void){\n";
|
print "int main(void){\n";
|
||||||
foreach $objs (@underscore_objs) {
|
foreach $objs (@underscore_objs) {
|
||||||
print $objs, $bu, "();\n" if $objs ne "xerbla";
|
print $objs, $bu, "();\n" if $objs ne "xerbla";
|
||||||
|
|
2
f_check
2
f_check
|
@ -24,7 +24,7 @@ $compiler = "" if $compiler eq "f77";
|
||||||
|
|
||||||
if ($compiler eq "") {
|
if ($compiler eq "") {
|
||||||
|
|
||||||
@lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
|
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
|
||||||
"sunf77", "sunf90", "sunf95",
|
"sunf77", "sunf90", "sunf95",
|
||||||
"xlf95", "xlf90", "xlf",
|
"xlf95", "xlf90", "xlf",
|
||||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||||
|
|
59
getarch.c
59
getarch.c
|
@ -83,6 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
#ifdef linux
|
#ifdef linux
|
||||||
#include <sys/sysinfo.h>
|
#include <sys/sysinfo.h>
|
||||||
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* #define FORCE_P2 */
|
/* #define FORCE_P2 */
|
||||||
|
@ -96,14 +97,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* #define FORCE_PENRYN */
|
/* #define FORCE_PENRYN */
|
||||||
/* #define FORCE_DUNNINGTON */
|
/* #define FORCE_DUNNINGTON */
|
||||||
/* #define FORCE_NEHALEM */
|
/* #define FORCE_NEHALEM */
|
||||||
|
/* #define FORCE_SANDYBRIDGE */
|
||||||
|
/* #define FORCE_ATOM */
|
||||||
/* #define FORCE_ATHLON */
|
/* #define FORCE_ATHLON */
|
||||||
/* #define FORCE_OPTERON */
|
/* #define FORCE_OPTERON */
|
||||||
/* #define FORCE_OPTERON_SSE3 */
|
/* #define FORCE_OPTERON_SSE3 */
|
||||||
/* #define FORCE_BARCELONA */
|
/* #define FORCE_BARCELONA */
|
||||||
/* #define FORCE_SHANGHAI */
|
/* #define FORCE_SHANGHAI */
|
||||||
/* #define FORCE_ISTANBUL */
|
/* #define FORCE_ISTANBUL */
|
||||||
/* #define FORCE_BULLDOZER */
|
|
||||||
/* #define FORCE_BOBCAT */
|
/* #define FORCE_BOBCAT */
|
||||||
|
/* #define FORCE_BULLDOZER */
|
||||||
|
/* #define FORCE_PILEDRIVER */
|
||||||
/* #define FORCE_SSE_GENERIC */
|
/* #define FORCE_SSE_GENERIC */
|
||||||
/* #define FORCE_VIAC3 */
|
/* #define FORCE_VIAC3 */
|
||||||
/* #define FORCE_NANO */
|
/* #define FORCE_NANO */
|
||||||
|
@ -121,9 +125,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* #define FORCE_LOONGSON3A */
|
/* #define FORCE_LOONGSON3A */
|
||||||
/* #define FORCE_LOONGSON3B */
|
/* #define FORCE_LOONGSON3B */
|
||||||
/* #define FORCE_ITANIUM2 */
|
/* #define FORCE_ITANIUM2 */
|
||||||
/* #define FORCE_GENERIC */
|
|
||||||
/* #define FORCE_SPARC */
|
/* #define FORCE_SPARC */
|
||||||
/* #define FORCE_SPARCV7 */
|
/* #define FORCE_SPARCV7 */
|
||||||
|
/* #define FORCE_GENERIC */
|
||||||
|
|
||||||
#ifdef FORCE_P2
|
#ifdef FORCE_P2
|
||||||
#define FORCE
|
#define FORCE
|
||||||
|
@ -139,20 +143,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "P5"
|
#define CORENAME "P5"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_COPPERMINE
|
|
||||||
#define FORCE
|
|
||||||
#define FORCE_INTEL
|
|
||||||
#define ARCHITECTURE "X86"
|
|
||||||
#define SUBARCHITECTURE "PENTIUM3"
|
|
||||||
#define ARCHCONFIG "-DPENTIUM3 " \
|
|
||||||
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
|
|
||||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
|
||||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
|
||||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
|
|
||||||
#define LIBNAME "coppermine"
|
|
||||||
#define CORENAME "COPPERMINE"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef FORCE_KATMAI
|
#ifdef FORCE_KATMAI
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -167,6 +157,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "KATMAI"
|
#define CORENAME "KATMAI"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_COPPERMINE
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "PENTIUM3"
|
||||||
|
#define ARCHCONFIG "-DPENTIUM3 " \
|
||||||
|
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
|
||||||
|
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
|
||||||
|
#define LIBNAME "coppermine"
|
||||||
|
#define CORENAME "COPPERMINE"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_NORTHWOOD
|
#ifdef FORCE_NORTHWOOD
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -396,6 +400,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "BULLDOZER"
|
#define CORENAME "BULLDOZER"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Forced-target description for PILEDRIVER.
 * When FORCE_PILEDRIVER is defined this group defines FORCE and fixes the
 * ARCHITECTURE / SUBARCHITECTURE / ARCHCONFIG / LIBNAME / CORENAME strings.
 * ARCHCONFIG is one string of -D flags built from adjacent literals:
 *   L1_DATA_SIZE=16384 (16 * 1024), L1/L2 line size 64,
 *   L2_SIZE=2097152 (2 * 1024 * 1024), L3_SIZE=12582912 (12 * 1024 * 1024),
 *   DTB_DEFAULT_ENTRIES=64, DTB_SIZE=4096,
 *   plus HAVE_* feature flags up to AVX, FMA4 and FMA3.
 * NOTE(review): sizes are presumably in bytes -- confirm with the consumers
 * of these macros.
 * NOTE(review): FORCE_INTEL is defined although Piledriver is an AMD core;
 * presumably this mirrors the neighbouring forced-target entries -- confirm. */
#if defined (FORCE_PILEDRIVER)
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "PILEDRIVER"
|
||||||
|
#define ARCHCONFIG "-DPILEDRIVER " \
|
||||||
|
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
|
||||||
|
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
|
||||||
|
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
|
||||||
|
#define LIBNAME "piledriver"
|
||||||
|
#define CORENAME "PILEDRIVER"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_SSE_GENERIC
|
#ifdef FORCE_SSE_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -717,7 +737,8 @@ static int get_num_cores(void) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#ifdef linux
|
||||||
return get_nprocs();
|
//returns the number of processors which are currently online
|
||||||
|
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
|
|
||||||
#elif defined(OS_WINDOWS)
|
#elif defined(OS_WINDOWS)
|
||||||
|
|
||||||
|
@ -802,8 +823,12 @@ int main(int argc, char *argv[]){
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if NO_PARALLEL_MAKE==1
|
||||||
|
printf("MAKE += -j 1\n");
|
||||||
|
#else
|
||||||
#ifndef OS_WINDOWS
|
#ifndef OS_WINDOWS
|
||||||
printf("MAKE += -j %d\n", get_num_cores());
|
printf("MAKE += -j %d\n", get_num_cores());
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
|
||||||
if ((argc < 1) || (*argv[1] == '0')) {
|
if ( (argc <= 1) || (argc >= 2) && (*argv[1] == '0')) {
|
||||||
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
||||||
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
||||||
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
||||||
|
@ -22,10 +22,48 @@ int main(int argc, char **argv) {
|
||||||
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
|
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
|
||||||
printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M);
|
printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M);
|
||||||
printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N);
|
printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N);
|
||||||
|
|
||||||
|
#ifdef CGEMM3M_DEFAULT_UNROLL_M
|
||||||
|
printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M);
|
||||||
|
#else
|
||||||
|
printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef CGEMM3M_DEFAULT_UNROLL_N
|
||||||
|
printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N);
|
||||||
|
#else
|
||||||
|
printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
|
||||||
|
printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
|
||||||
|
#else
|
||||||
|
printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ZGEMM3M_DEFAULT_UNROLL_N
|
||||||
|
printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
|
||||||
|
#else
|
||||||
|
printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Emit XGEMM3M_UNROLL_M: use the dedicated XGEMM3M default when the target
 * defines one, otherwise fall back to the QGEMM unroll factor.
 * The guarded branch must print the same macro the #ifdef tests; it used to
 * print ZGEMM3M_DEFAULT_UNROLL_M, which reports the wrong value (or fails to
 * compile when only the XGEMM3M macro is defined). */
#ifdef XGEMM3M_DEFAULT_UNROLL_M
|
||||||
|
printf("XGEMM3M_UNROLL_M=%d\n", XGEMM3M_DEFAULT_UNROLL_M);
|
||||||
|
#else
|
||||||
|
printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Emit XGEMM3M_UNROLL_N: use the dedicated XGEMM3M default when the target
 * defines one, otherwise fall back to the QGEMM unroll factor.
 * The guarded branch must print the same macro the #ifdef tests; it used to
 * print ZGEMM3M_DEFAULT_UNROLL_N, which reports the wrong value (or fails to
 * compile when only the XGEMM3M macro is defined). */
#ifdef XGEMM3M_DEFAULT_UNROLL_N
|
||||||
|
printf("XGEMM3M_UNROLL_N=%d\n", XGEMM3M_DEFAULT_UNROLL_N);
|
||||||
|
#else
|
||||||
|
printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if ((argc >= 1) && (*argv[1] == '1')) {
|
if ((argc >= 2) && (*argv[1] == '1')) {
|
||||||
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
|
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
|
||||||
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
|
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
|
||||||
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
|
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
|
||||||
|
|
|
@ -60,6 +60,8 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
extern void dtrtri_lapack_(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);
|
||||||
|
|
||||||
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||||
|
|
||||||
blas_arg_t args;
|
blas_arg_t args;
|
||||||
|
@ -83,6 +85,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||||
TOUPPER(uplo_arg);
|
TOUPPER(uplo_arg);
|
||||||
TOUPPER(diag_arg);
|
TOUPPER(diag_arg);
|
||||||
|
|
||||||
|
|
||||||
uplo = -1;
|
uplo = -1;
|
||||||
if (uplo_arg == 'U') uplo = 0;
|
if (uplo_arg == 'U') uplo = 0;
|
||||||
if (uplo_arg == 'L') uplo = 1;
|
if (uplo_arg == 'L') uplo = 1;
|
||||||
|
@ -90,6 +93,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||||
if (diag_arg == 'U') diag = 0;
|
if (diag_arg == 'U') diag = 0;
|
||||||
if (diag_arg == 'N') diag = 1;
|
if (diag_arg == 'N') diag = 1;
|
||||||
|
|
||||||
|
|
||||||
info = 0;
|
info = 0;
|
||||||
if (args.lda < MAX(1,args.n)) info = 5;
|
if (args.lda < MAX(1,args.n)) info = 5;
|
||||||
if (args.n < 0) info = 3;
|
if (args.n < 0) info = 3;
|
||||||
|
@ -129,6 +133,15 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if DOUBLE
|
||||||
|
// Workaround: the single-threaded double-precision trtri path is reported
// to give wrong results for the upper-triangular case.
|
||||||
|
// When uplo == 0 (UPLO == 'U') hand the whole call to dtrtri_lapack_
// instead; it receives Info directly, and this function then returns 0.
// NOTE(review): this returns before anything later in the function runs --
// confirm no resource acquired earlier needs releasing on this path.
// NOTE(review): dtrtri_lapack_ is declared with int* arguments while this
// function takes blasint* -- confirm the two agree when blasint is wider.
|
||||||
|
if(uplo==0){
|
||||||
|
dtrtri_lapack_(UPLO, DIAG, N, a, ldA, Info);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
|
@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
|
||||||
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
||||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
|
||||||
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
|
||||||
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
|
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
|
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
|
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
|
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
|
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||||
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||||
|
|
||||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||||
|
|
|
@ -826,6 +826,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef PILEDRIVER
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Piledriver\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||||
|
SGEMMINCOPY =
|
||||||
|
SGEMMITCOPY =
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
SGEMMINCOPYOBJ =
|
||||||
|
SGEMMITCOPYOBJ =
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||||
|
CGEMMINCOPY =
|
||||||
|
CGEMMITCOPY =
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ =
|
||||||
|
CGEMMITCOPYOBJ =
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
|
||||||
|
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
|
|
@ -101,10 +101,10 @@
|
||||||
#define Y 36 + STACKSIZE+ARGS(%esp)
|
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
#define MMM 0+ARGS(%esp)
|
#define MMM 0+ARGS(%esp)
|
||||||
#define YY 4+ARGS(%esp)
|
#define YY 4+ARGS(%esp)
|
||||||
#define AA 8+ARGS(%esp)
|
#define AA 8+ARGS(%esp)
|
||||||
#define LDAX 12+ARGS(%esp)
|
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -153,8 +153,8 @@
|
||||||
|
|
||||||
movl YY,J
|
movl YY,J
|
||||||
movl J,Y
|
movl J,Y
|
||||||
movl STACK_LDA, LDA
|
|
||||||
|
|
||||||
|
movl STACK_LDA, LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
|
|
||||||
|
@ -688,9 +688,9 @@
|
||||||
movl M,J
|
movl M,J
|
||||||
leal (,J,SIZE),%eax
|
leal (,J,SIZE),%eax
|
||||||
addl %eax,AA
|
addl %eax,AA
|
||||||
movl YY,J
|
movl STACK_INCY,INCY
|
||||||
addl %eax,J
|
imull INCY,%eax
|
||||||
movl J,YY
|
addl %eax,YY
|
||||||
jmp .L0t
|
jmp .L0t
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
|
|
|
@ -714,9 +714,9 @@
|
||||||
movl M,J
|
movl M,J
|
||||||
leal (,J,SIZE),%eax
|
leal (,J,SIZE),%eax
|
||||||
addl %eax,AA
|
addl %eax,AA
|
||||||
movl YY,J
|
movl STACK_INCY,INCY
|
||||||
addl %eax,J
|
imull INCY,%eax
|
||||||
movl J,YY
|
addl %eax,YY
|
||||||
jmp .L0t
|
jmp .L0t
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
|
|
|
@ -102,11 +102,9 @@
|
||||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
#define MMM 0+STACKSIZE(%esp)
|
#define MMM 0+ARGS(%esp)
|
||||||
#define NN 4+STACKSIZE(%esp)
|
#define AA 4+ARGS(%esp)
|
||||||
#define AA 8+STACKSIZE(%esp)
|
#define XX 8+ARGS(%esp)
|
||||||
#define LDAX 12+STACKSIZE(%esp)
|
|
||||||
#define XX 16+STACKSIZE(%esp)
|
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -129,12 +127,8 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
|
||||||
movl LDA,LDAX # backup LDA
|
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl X,XX
|
movl X,XX
|
||||||
movl N,J
|
|
||||||
movl J,NN # backup N
|
|
||||||
movl A,J
|
movl A,J
|
||||||
movl J,AA # backup A
|
movl J,AA # backup A
|
||||||
movl M,J
|
movl M,J
|
||||||
|
@ -144,7 +138,6 @@
|
||||||
addl $1,J
|
addl $1,J
|
||||||
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
|
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
|
||||||
subl $8, J # Don't use last 8 float in the buffer.
|
subl $8, J # Don't use last 8 float in the buffer.
|
||||||
# Now, split M by block J
|
|
||||||
subl J,MMM # MMM=MMM-J
|
subl J,MMM # MMM=MMM-J
|
||||||
movl J,M
|
movl J,M
|
||||||
jge .L00t
|
jge .L00t
|
||||||
|
@ -159,13 +152,10 @@
|
||||||
movl AA,%eax
|
movl AA,%eax
|
||||||
movl %eax,A # mov AA to A
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
movl NN,%eax
|
movl XX,%eax
|
||||||
movl %eax,N # reset N
|
movl %eax,X
|
||||||
|
|
||||||
|
|
||||||
movl LDAX, LDA # reset LDA
|
|
||||||
movl XX,X
|
|
||||||
|
|
||||||
|
movl STACK_LDA, LDA
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
|
||||||
|
@ -688,9 +678,9 @@
|
||||||
movl M,J
|
movl M,J
|
||||||
leal (,J,SIZE),%eax
|
leal (,J,SIZE),%eax
|
||||||
addl %eax,AA
|
addl %eax,AA
|
||||||
movl XX,J
|
movl STACK_INCX,INCX
|
||||||
addl %eax,J
|
imull INCX,%eax
|
||||||
movl J,XX
|
addl %eax,XX
|
||||||
jmp .L0t
|
jmp .L0t
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
#define ARGS 16
|
#define ARGS 20
|
||||||
|
|
||||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
|
@ -89,10 +89,9 @@
|
||||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
#define MMM 0+STACKSIZE(%esp)
|
#define MMM 0+ARGS(%esp)
|
||||||
#define AA 4+STACKSIZE(%esp)
|
#define AA 4+ARGS(%esp)
|
||||||
#define LDAX 8+STACKSIZE(%esp)
|
#define XX 8+ARGS(%esp)
|
||||||
#define NN 12+STACKSIZE(%esp)
|
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -117,10 +116,8 @@
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_X, X
|
||||||
movl LDA,LDAX # backup LDA
|
movl X,XX
|
||||||
movl N,J
|
|
||||||
movl J,NN # backup N
|
|
||||||
movl A,J
|
movl A,J
|
||||||
movl J,AA # backup A
|
movl J,AA # backup A
|
||||||
movl M,J
|
movl M,J
|
||||||
|
@ -130,7 +127,6 @@
|
||||||
addl $1,J
|
addl $1,J
|
||||||
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
|
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
|
||||||
subl $4, J # Don't use last 4 double in the buffer.
|
subl $4, J # Don't use last 4 double in the buffer.
|
||||||
# Now, split M by block J
|
|
||||||
subl J,MMM # MMM=MMM-J
|
subl J,MMM # MMM=MMM-J
|
||||||
movl J,M
|
movl J,M
|
||||||
jge .L00t
|
jge .L00t
|
||||||
|
@ -142,15 +138,13 @@
|
||||||
movl %eax,M
|
movl %eax,M
|
||||||
|
|
||||||
.L00t:
|
.L00t:
|
||||||
|
movl XX,%eax
|
||||||
|
movl %eax, X
|
||||||
|
|
||||||
movl AA,%eax
|
movl AA,%eax
|
||||||
movl %eax,A # mov AA to A
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
movl NN,%eax
|
movl STACK_LDA, LDA
|
||||||
movl %eax,N # reset N
|
|
||||||
|
|
||||||
|
|
||||||
movl LDAX, LDA # reset LDA
|
|
||||||
movl STACK_X, X
|
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
|
||||||
|
@ -605,6 +599,9 @@
|
||||||
movl M,J
|
movl M,J
|
||||||
leal (,J,SIZE),%eax
|
leal (,J,SIZE),%eax
|
||||||
addl %eax,AA
|
addl %eax,AA
|
||||||
|
movl STACK_INCX,INCX
|
||||||
|
imull INCX,%eax
|
||||||
|
addl %eax,XX
|
||||||
jmp .L0t
|
jmp .L0t
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
|
|
|
@ -74,11 +74,11 @@
|
||||||
#else
|
#else
|
||||||
movl %eax, %ecx
|
movl %eax, %ecx
|
||||||
subl $32, %ecx
|
subl $32, %ecx
|
||||||
cmovg %ecx, %eax
|
cmovge %ecx, %eax
|
||||||
|
|
||||||
movl %edx, %ecx
|
movl %edx, %ecx
|
||||||
subl $32, %ecx
|
subl $32, %ecx
|
||||||
cmovg %ecx, %edx
|
cmovge %ecx, %edx
|
||||||
|
|
||||||
subl %eax, %edx
|
subl %eax, %edx
|
||||||
movl $0, %eax
|
movl $0, %eax
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -488,7 +488,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1697,7 +1697,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1727,7 +1727,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -437,7 +437,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -833,7 +833,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1848,7 +1848,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2109,7 +2109,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2429,7 +2429,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2459,7 +2459,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2952,7 +2952,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -3148,7 +3148,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3389,7 +3389,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -3404,7 +3404,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -910,7 +910,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -959,7 +959,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1439,7 +1439,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1469,7 +1469,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -872,7 +872,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1316,7 +1316,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1855,7 +1855,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1885,7 +1885,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2249,7 +2249,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2562,7 +2562,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2957,7 +2957,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -2972,7 +2972,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -3280,7 +3280,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3515,7 +3515,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -1036,7 +1036,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1066,7 +1066,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
@ -2224,7 +2224,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -2273,7 +2273,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -454,7 +454,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -758,7 +758,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -993,7 +993,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -1324,7 +1324,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1354,7 +1354,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1718,7 +1718,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2031,7 +2031,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2859,7 +2859,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -3303,7 +3303,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
|
|
@ -89,18 +89,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 20
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_I 20 + STACKSIZE(%esp)
|
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 24 + STACKSIZE(%esp)
|
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 32 + STACKSIZE(%esp)
|
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 40 + STACKSIZE(%esp)
|
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 48 + STACKSIZE(%esp)
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+ARGS(%esp)
|
||||||
|
#define YY 4+ARGS(%esp)
|
||||||
|
#define AA 8+ARGS(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -123,6 +128,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -130,6 +136,33 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
movl Y,J
|
||||||
|
movl J,YY
|
||||||
|
movl A,J
|
||||||
|
movl J,AA
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $20,J
|
||||||
|
subl J,MMM
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl YY,J
|
||||||
|
movl J,Y
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
|
@ -595,10 +628,21 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,%eax
|
||||||
|
sall $ZBASE_SHIFT,%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl STACK_INCY,INCY
|
||||||
|
imull INCY,%eax
|
||||||
|
addl %eax,YY
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -76,18 +76,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define A 32 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define Y 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define MMM 0 + ARGS(%esp)
|
||||||
|
#define YY 4 + ARGS(%esp)
|
||||||
|
#define AA 8 + ARGS(%esp)
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
|
||||||
#define N 8 + STACKSIZE(%esp)
|
|
||||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
|
||||||
#define ALPHA_I 24 + STACKSIZE(%esp)
|
|
||||||
#define A 32 + STACKSIZE(%esp)
|
|
||||||
#define STACK_LDA 36 + STACKSIZE(%esp)
|
|
||||||
#define STACK_X 40 + STACKSIZE(%esp)
|
|
||||||
#define STACK_INCX 44 + STACKSIZE(%esp)
|
|
||||||
#define Y 48 + STACKSIZE(%esp)
|
|
||||||
#define STACK_INCY 52 + STACKSIZE(%esp)
|
|
||||||
#define BUFFER 56 + STACKSIZE(%esp)
|
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -110,6 +115,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -117,6 +123,33 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
movl Y,J
|
||||||
|
movl J,YY
|
||||||
|
movl A,J
|
||||||
|
movl J,AA
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $18,J
|
||||||
|
subl J,MMM
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl YY,J
|
||||||
|
movl J,Y
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
|
@ -458,10 +491,21 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,%eax
|
||||||
|
sall $ZBASE_SHIFT,%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl STACK_INCY,INCY
|
||||||
|
imull INCY,%eax
|
||||||
|
addl %eax,YY
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -89,18 +89,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 20
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_I 20 + STACKSIZE(%esp)
|
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 24 + STACKSIZE(%esp)
|
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 32 + STACKSIZE(%esp)
|
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 40 + STACKSIZE(%esp)
|
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 48 + STACKSIZE(%esp)
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+ARGS(%esp)
|
||||||
|
#define XX 4+ARGS(%esp)
|
||||||
|
#define AA 8+ARGS(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -123,6 +128,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -130,8 +136,35 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
|
movl X,XX
|
||||||
|
movl A,J
|
||||||
|
movl J,AA #backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $20,J
|
||||||
|
subl $8,J
|
||||||
|
subl J,MMM #MMM-=J
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl XX,%eax
|
||||||
|
movl %eax,X
|
||||||
|
|
||||||
|
movl STACK_LDA,LDA
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
|
||||||
|
@ -513,10 +546,22 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,%eax
|
||||||
|
sall $ZBASE_SHIFT, %eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl STACK_INCX,INCX
|
||||||
|
imull INCX,%eax
|
||||||
|
addl %eax,XX
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -76,18 +76,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 20
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_R 16 + STACKSIZE(%esp)
|
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA_I 24 + STACKSIZE(%esp)
|
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 32 + STACKSIZE(%esp)
|
#define A 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 36 + STACKSIZE(%esp)
|
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 40 + STACKSIZE(%esp)
|
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 44 + STACKSIZE(%esp)
|
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 48 + STACKSIZE(%esp)
|
#define Y 48 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 52 + STACKSIZE(%esp)
|
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 56 + STACKSIZE(%esp)
|
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0 + ARGS(%esp)
|
||||||
|
#define AA 4 + ARGS(%esp)
|
||||||
|
#define XX 8 + ARGS(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -110,6 +115,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -117,8 +123,35 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
|
movl X, XX
|
||||||
|
movl A,J
|
||||||
|
movl J,AA
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $18,J
|
||||||
|
subl $4,J
|
||||||
|
subl J,MMM
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax, M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl XX, %eax
|
||||||
|
movl %eax, X
|
||||||
|
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl STACK_LDA, LDA
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
|
||||||
|
@ -188,7 +221,7 @@
|
||||||
movl Y, Y1
|
movl Y, Y1
|
||||||
|
|
||||||
movl N, J
|
movl N, J
|
||||||
ALIGN_3
|
ALIGN_4
|
||||||
|
|
||||||
.L11:
|
.L11:
|
||||||
movl BUFFER, X
|
movl BUFFER, X
|
||||||
|
@ -395,10 +428,21 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,%eax
|
||||||
|
sall $ZBASE_SHIFT,%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl STACK_INCX,INCX
|
||||||
|
imull INCX,%eax
|
||||||
|
addl %eax,XX
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -533,7 +533,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -994,7 +994,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -1820,7 +1820,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -1,62 +1,71 @@
|
||||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_8x4_bulldozer.S
|
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
DAXPYKERNEL = daxpy_bulldozer.S
|
||||||
SGEMMONCOPY = gemm_ncopy_4_opteron.S
|
DDOTKERNEL = ddot_bulldozer.S
|
||||||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
|
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
||||||
DGEMMINCOPY =
|
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
||||||
DGEMMITCOPY =
|
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
||||||
DGEMMONCOPY = gemm_ncopy_4_opteron.S
|
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
DGEMMINCOPYOBJ =
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMITCOPYOBJ =
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
|
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
CGEMMONCOPY = zgemm_ncopy_2.S
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
CGEMMOTCOPY = zgemm_tcopy_2.S
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
||||||
ZGEMMINCOPY =
|
ZGEMMINCOPY =
|
||||||
ZGEMMITCOPY =
|
ZGEMMITCOPY =
|
||||||
ZGEMMONCOPY = zgemm_ncopy_2.S
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = zgemm_tcopy_2.S
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPYOBJ =
|
ZGEMMINCOPYOBJ =
|
||||||
ZGEMMITCOPYOBJ =
|
ZGEMMITCOPYOBJ =
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
|
|
||||||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
|
|
||||||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
|
|
||||||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
|
|
||||||
|
|
||||||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
|
|
||||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
|
|
||||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
|
|
||||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
|
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
|
|
||||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
|
|
||||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
|
|
||||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
|
|
||||||
|
|
||||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
|
|
||||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
|
|
||||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
|
|
||||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
|
|
||||||
|
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||||
|
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||||
|
DAXPYKERNEL = daxpy_bulldozer.S
|
||||||
|
DDOTKERNEL = ddot_bulldozer.S
|
||||||
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
||||||
|
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#endif
|
#endif
|
||||||
movaps %xmm0, ALPHA
|
movaps %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
movaps %xmm3, ALPHA
|
|
||||||
|
|
||||||
movq 40(%rsp), X
|
movq 40(%rsp), X
|
||||||
movq 48(%rsp), INCX
|
movq 48(%rsp), INCX
|
||||||
|
@ -79,6 +79,10 @@
|
||||||
|
|
||||||
SAVEREGISTERS
|
SAVEREGISTERS
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
movaps %xmm3, ALPHA
|
||||||
|
#endif
|
||||||
|
|
||||||
shufps $0, ALPHA, ALPHA
|
shufps $0, ALPHA, ALPHA
|
||||||
|
|
||||||
leaq (, INCX, SIZE), INCX
|
leaq (, INCX, SIZE), INCX
|
||||||
|
|
|
@ -69,7 +69,6 @@
|
||||||
#endif
|
#endif
|
||||||
movaps %xmm0, ALPHA
|
movaps %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
movaps %xmm3, ALPHA
|
|
||||||
|
|
||||||
movq 40(%rsp), X
|
movq 40(%rsp), X
|
||||||
movq 48(%rsp), INCX
|
movq 48(%rsp), INCX
|
||||||
|
@ -79,6 +78,10 @@
|
||||||
|
|
||||||
SAVEREGISTERS
|
SAVEREGISTERS
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
movaps %xmm3, ALPHA
|
||||||
|
#endif
|
||||||
|
|
||||||
unpcklpd ALPHA, ALPHA
|
unpcklpd ALPHA, ALPHA
|
||||||
|
|
||||||
leaq (, INCX, SIZE), INCX
|
leaq (, INCX, SIZE), INCX
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -47,7 +47,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||||
|
@ -55,6 +55,14 @@
|
||||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 48 (%rsp)
|
#define ALPHA 48 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 64(%rsp)
|
||||||
|
#define NN 72(%rsp)
|
||||||
|
#define AA 80(%rsp)
|
||||||
|
#define XX 88(%rsp)
|
||||||
|
#define LDAX 96(%rsp)
|
||||||
|
#define ALPHAR 104(%rsp)
|
||||||
|
#define ALPHAI 112(%rsp)
|
||||||
|
|
||||||
#define M %rdi
|
#define M %rdi
|
||||||
#define N %rsi
|
#define N %rsi
|
||||||
#define A %rcx
|
#define A %rcx
|
||||||
|
@ -66,7 +74,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||||
|
@ -78,6 +86,14 @@
|
||||||
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 224 (%rsp)
|
#define ALPHA 224 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 232(%rsp)
|
||||||
|
#define NN 240(%rsp)
|
||||||
|
#define AA 248(%rsp)
|
||||||
|
#define XX 256(%rsp)
|
||||||
|
#define LDAX 264(%rsp)
|
||||||
|
#define ALPHAR 272(%rsp)
|
||||||
|
#define ALPHAI 280(%rsp)
|
||||||
|
|
||||||
#define M %rcx
|
#define M %rcx
|
||||||
#define N %rdx
|
#define N %rdx
|
||||||
#define A %r8
|
#define A %r8
|
||||||
|
@ -142,9 +158,37 @@
|
||||||
movaps %xmm3, %xmm0
|
movaps %xmm3, %xmm0
|
||||||
movss OLD_ALPHA_I, %xmm1
|
movss OLD_ALPHA_I, %xmm1
|
||||||
#endif
|
#endif
|
||||||
|
movq A, AA
|
||||||
|
movq N, NN
|
||||||
|
movq M, MMM
|
||||||
|
movq LDA, LDAX
|
||||||
|
movq X, XX
|
||||||
|
movq OLD_Y, Y
|
||||||
|
movss %xmm0,ALPHAR
|
||||||
|
movss %xmm1,ALPHAI
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq I,I
|
||||||
|
addq $1,I
|
||||||
|
salq $20,I
|
||||||
|
subq I,MMM
|
||||||
|
movq I,M
|
||||||
|
movss ALPHAR,%xmm0
|
||||||
|
movss ALPHAI,%xmm1
|
||||||
|
jge .L00t
|
||||||
|
|
||||||
|
movq MMM,M
|
||||||
|
addq I,M
|
||||||
|
jle .L999x
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq AA, A
|
||||||
|
movq NN, N
|
||||||
|
movq LDAX, LDA
|
||||||
|
movq XX, X
|
||||||
|
|
||||||
movq OLD_INCX, INCX
|
movq OLD_INCX, INCX
|
||||||
movq OLD_Y, Y
|
# movq OLD_Y, Y
|
||||||
movq OLD_INCY, INCY
|
movq OLD_INCY, INCY
|
||||||
movq OLD_BUFFER, BUFFER
|
movq OLD_BUFFER, BUFFER
|
||||||
|
|
||||||
|
@ -4274,6 +4318,11 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movq M, I
|
||||||
|
salq $ZBASE_SHIFT,I
|
||||||
|
addq I,AA
|
||||||
|
jmp .L0t
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
|
@ -47,13 +47,19 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||||
#define OLD_INCY 24 + STACKSIZE(%rsp)
|
#define OLD_INCY 24 + STACKSIZE(%rsp)
|
||||||
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 48 (%rsp)
|
#define ALPHA 48 (%rsp)
|
||||||
|
#define MMM 64(%rsp)
|
||||||
|
#define NN 72(%rsp)
|
||||||
|
#define AA 80(%rsp)
|
||||||
|
#define LDAX 88(%rsp)
|
||||||
|
#define ALPHAR 96(%rsp)
|
||||||
|
#define ALPHAI 104(%rsp)
|
||||||
|
|
||||||
#define M %rdi
|
#define M %rdi
|
||||||
#define N %rsi
|
#define N %rsi
|
||||||
|
@ -66,7 +72,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||||
|
@ -78,6 +84,13 @@
|
||||||
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 224 (%rsp)
|
#define ALPHA 224 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 232(%rsp)
|
||||||
|
#define NN 240(%rsp)
|
||||||
|
#define AA 248(%rsp)
|
||||||
|
#define LDAX 256(%rsp)
|
||||||
|
#define ALPHAR 264(%rsp)
|
||||||
|
#define ALPHAI 272(%rsp)
|
||||||
|
|
||||||
#define M %rcx
|
#define M %rcx
|
||||||
#define N %rdx
|
#define N %rdx
|
||||||
#define A %r8
|
#define A %r8
|
||||||
|
@ -144,6 +157,32 @@
|
||||||
movss OLD_ALPHA_I, %xmm1
|
movss OLD_ALPHA_I, %xmm1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
movq A, AA
|
||||||
|
movq N, NN
|
||||||
|
movq M, MMM
|
||||||
|
movq LDA, LDAX
|
||||||
|
movss %xmm0,ALPHAR
|
||||||
|
movss %xmm1,ALPHAI
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq I,I
|
||||||
|
addq $1,I
|
||||||
|
salq $20,I
|
||||||
|
subq I,MMM
|
||||||
|
movq I,M
|
||||||
|
movss ALPHAR,%xmm0
|
||||||
|
movss ALPHAI,%xmm1
|
||||||
|
jge .L00t
|
||||||
|
|
||||||
|
movq MMM,M
|
||||||
|
addq I,M
|
||||||
|
jle .L999x
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq AA, A
|
||||||
|
movq NN, N
|
||||||
|
movq LDAX, LDA
|
||||||
|
|
||||||
movq OLD_INCX, INCX
|
movq OLD_INCX, INCX
|
||||||
movq OLD_Y, Y
|
movq OLD_Y, Y
|
||||||
movq OLD_INCY, INCY
|
movq OLD_INCY, INCY
|
||||||
|
@ -4350,6 +4389,11 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movq M, I
|
||||||
|
salq $ZBASE_SHIFT,I
|
||||||
|
addq I,AA
|
||||||
|
jmp .L0t
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
|
@ -0,0 +1,408 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
#define M ARG1
|
||||||
|
#define X ARG4
|
||||||
|
#define INCX ARG5
|
||||||
|
#define Y ARG6
|
||||||
|
#define INCY ARG2
|
||||||
|
#else
|
||||||
|
#define M ARG1
|
||||||
|
#define X ARG2
|
||||||
|
#define INCX ARG3
|
||||||
|
#define Y ARG4
|
||||||
|
#define INCY %r10
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define YY %r11
|
||||||
|
#define ALPHA %xmm15
|
||||||
|
|
||||||
|
#define A_PRE 640
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
#ifndef XDOUBLE
|
||||||
|
movq 8(%rsp), INCY
|
||||||
|
#else
|
||||||
|
movq 24(%rsp), INCY
|
||||||
|
#endif
|
||||||
|
vmovups %xmm0, ALPHA
|
||||||
|
#else
|
||||||
|
vmovups %xmm3, ALPHA
|
||||||
|
|
||||||
|
movq 40(%rsp), X
|
||||||
|
movq 48(%rsp), INCX
|
||||||
|
movq 56(%rsp), Y
|
||||||
|
movq 64(%rsp), INCY
|
||||||
|
#endif
|
||||||
|
|
||||||
|
SAVEREGISTERS
|
||||||
|
|
||||||
|
unpcklpd ALPHA, ALPHA
|
||||||
|
|
||||||
|
leaq (, INCX, SIZE), INCX
|
||||||
|
leaq (, INCY, SIZE), INCY
|
||||||
|
|
||||||
|
testq M, M
|
||||||
|
jle .L47
|
||||||
|
|
||||||
|
cmpq $SIZE, INCX
|
||||||
|
jne .L40
|
||||||
|
cmpq $SIZE, INCY
|
||||||
|
jne .L40
|
||||||
|
|
||||||
|
testq $SIZE, Y
|
||||||
|
je .L10
|
||||||
|
|
||||||
|
movsd (X), %xmm0
|
||||||
|
mulsd ALPHA, %xmm0
|
||||||
|
addsd (Y), %xmm0
|
||||||
|
movsd %xmm0, (Y)
|
||||||
|
addq $1 * SIZE, X
|
||||||
|
addq $1 * SIZE, Y
|
||||||
|
decq M
|
||||||
|
jle .L19
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L10:
|
||||||
|
subq $-16 * SIZE, X
|
||||||
|
subq $-16 * SIZE, Y
|
||||||
|
|
||||||
|
movq M, %rax
|
||||||
|
sarq $4, %rax
|
||||||
|
jle .L13
|
||||||
|
|
||||||
|
vmovups -16 * SIZE(X), %xmm0
|
||||||
|
vmovups -14 * SIZE(X), %xmm1
|
||||||
|
vmovups -12 * SIZE(X), %xmm2
|
||||||
|
vmovups -10 * SIZE(X), %xmm3
|
||||||
|
|
||||||
|
decq %rax
|
||||||
|
jle .L12
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L11:
|
||||||
|
|
||||||
|
prefetchnta A_PRE(Y)
|
||||||
|
|
||||||
|
vmovups -8 * SIZE(X), %xmm4
|
||||||
|
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
|
||||||
|
vmovups -6 * SIZE(X), %xmm5
|
||||||
|
vmovups -4 * SIZE(X), %xmm6
|
||||||
|
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
|
||||||
|
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
|
||||||
|
vmovups -2 * SIZE(X), %xmm7
|
||||||
|
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(Y)
|
||||||
|
vmovups %xmm1, -14 * SIZE(Y)
|
||||||
|
prefetchnta A_PRE(X)
|
||||||
|
nop
|
||||||
|
vmovups %xmm2, -12 * SIZE(Y)
|
||||||
|
vmovups %xmm3, -10 * SIZE(Y)
|
||||||
|
|
||||||
|
prefetchnta A_PRE+64(Y)
|
||||||
|
|
||||||
|
vmovups 0 * SIZE(X), %xmm0
|
||||||
|
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
|
||||||
|
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
|
||||||
|
vmovups 2 * SIZE(X), %xmm1
|
||||||
|
vmovups 4 * SIZE(X), %xmm2
|
||||||
|
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
|
||||||
|
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
|
||||||
|
vmovups 6 * SIZE(X), %xmm3
|
||||||
|
|
||||||
|
|
||||||
|
vmovups %xmm4, -8 * SIZE(Y)
|
||||||
|
vmovups %xmm5, -6 * SIZE(Y)
|
||||||
|
prefetchnta A_PRE+64(X)
|
||||||
|
nop
|
||||||
|
vmovups %xmm6, -4 * SIZE(Y)
|
||||||
|
vmovups %xmm7, -2 * SIZE(Y)
|
||||||
|
|
||||||
|
subq $-16 * SIZE, Y
|
||||||
|
subq $-16 * SIZE, X
|
||||||
|
decq %rax
|
||||||
|
jg .L11
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L12:
|
||||||
|
|
||||||
|
vmovups -8 * SIZE(X), %xmm4
|
||||||
|
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
|
||||||
|
vmovups -6 * SIZE(X), %xmm5
|
||||||
|
vmovups -4 * SIZE(X), %xmm6
|
||||||
|
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
|
||||||
|
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
|
||||||
|
vmovups -2 * SIZE(X), %xmm7
|
||||||
|
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(Y)
|
||||||
|
vmovups %xmm1, -14 * SIZE(Y)
|
||||||
|
vmovups %xmm2, -12 * SIZE(Y)
|
||||||
|
vmovups %xmm3, -10 * SIZE(Y)
|
||||||
|
|
||||||
|
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
|
||||||
|
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
|
||||||
|
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
|
||||||
|
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
|
||||||
|
|
||||||
|
vmovups %xmm4, -8 * SIZE(Y)
|
||||||
|
vmovups %xmm5, -6 * SIZE(Y)
|
||||||
|
vmovups %xmm6, -4 * SIZE(Y)
|
||||||
|
vmovups %xmm7, -2 * SIZE(Y)
|
||||||
|
|
||||||
|
subq $-16 * SIZE, Y
|
||||||
|
subq $-16 * SIZE, X
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L13:
|
||||||
|
|
||||||
|
|
||||||
|
movq M, %rax
|
||||||
|
andq $8, %rax
|
||||||
|
jle .L14
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
vmovups -16 * SIZE(X), %xmm0
|
||||||
|
vmovups -14 * SIZE(X), %xmm1
|
||||||
|
vmovups -12 * SIZE(X), %xmm2
|
||||||
|
vmovups -10 * SIZE(X), %xmm3
|
||||||
|
|
||||||
|
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
|
||||||
|
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
|
||||||
|
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(Y)
|
||||||
|
vmovups %xmm1, -14 * SIZE(Y)
|
||||||
|
vmovups %xmm2, -12 * SIZE(Y)
|
||||||
|
vmovups %xmm3, -10 * SIZE(Y)
|
||||||
|
|
||||||
|
addq $8 * SIZE, X
|
||||||
|
addq $8 * SIZE, Y
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L14:
|
||||||
|
movq M, %rax
|
||||||
|
andq $4, %rax
|
||||||
|
jle .L15
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
vmovups -16 * SIZE(X), %xmm0
|
||||||
|
vmovups -14 * SIZE(X), %xmm1
|
||||||
|
|
||||||
|
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(Y)
|
||||||
|
vmovups %xmm1, -14 * SIZE(Y)
|
||||||
|
|
||||||
|
addq $4 * SIZE, X
|
||||||
|
addq $4 * SIZE, Y
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L15:
|
||||||
|
movq M, %rax
|
||||||
|
andq $2, %rax
|
||||||
|
jle .L16
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
vmovups -16 * SIZE(X), %xmm0
|
||||||
|
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
vmovups %xmm0, -16 * SIZE(Y)
|
||||||
|
|
||||||
|
addq $2 * SIZE, X
|
||||||
|
addq $2 * SIZE, Y
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L16:
|
||||||
|
movq M, %rax
|
||||||
|
andq $1, %rax
|
||||||
|
jle .L19
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
vmovsd -16 * SIZE(X), %xmm0
|
||||||
|
vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
|
||||||
|
|
||||||
|
vmovsd %xmm0, -16 * SIZE(Y)
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L19:
|
||||||
|
xorq %rax,%rax
|
||||||
|
|
||||||
|
RESTOREREGISTERS
|
||||||
|
|
||||||
|
ret
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
movq Y, YY
|
||||||
|
movq M, %rax
|
||||||
|
//If incx==0 || incy==0, skip the unrolled loop and use the one-element loop at .L46.
|
||||||
|
cmpq $0, INCX
|
||||||
|
je .L46
|
||||||
|
cmpq $0, INCY
|
||||||
|
je .L46
|
||||||
|
|
||||||
|
sarq $3, %rax
|
||||||
|
jle .L45
|
||||||
|
|
||||||
|
prefetchnta 512(X)
|
||||||
|
prefetchnta 512+64(X)
|
||||||
|
prefetchnta 512+128(X)
|
||||||
|
prefetchnta 512+192(X)
|
||||||
|
|
||||||
|
prefetchnta 512(Y)
|
||||||
|
prefetchnta 512+64(Y)
|
||||||
|
prefetchnta 512+128(Y)
|
||||||
|
prefetchnta 512+192(Y)
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L41:
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(X), %xmm0
|
||||||
|
addq INCX, X
|
||||||
|
vmovhpd 0 * SIZE(X), %xmm0 , %xmm0
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(YY), %xmm6
|
||||||
|
addq INCY, YY
|
||||||
|
vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6
|
||||||
|
addq INCY, YY
|
||||||
|
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(X), %xmm1
|
||||||
|
addq INCX, X
|
||||||
|
vmovhpd 0 * SIZE(X), %xmm1 , %xmm1
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(YY), %xmm7
|
||||||
|
addq INCY, YY
|
||||||
|
vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7
|
||||||
|
addq INCY, YY
|
||||||
|
|
||||||
|
vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(X), %xmm2
|
||||||
|
addq INCX, X
|
||||||
|
vmovhpd 0 * SIZE(X), %xmm2 , %xmm2
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(YY), %xmm8
|
||||||
|
addq INCY, YY
|
||||||
|
vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8
|
||||||
|
addq INCY, YY
|
||||||
|
|
||||||
|
vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(X), %xmm3
|
||||||
|
addq INCX, X
|
||||||
|
vmovhpd 0 * SIZE(X), %xmm3 , %xmm3
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(YY), %xmm9
|
||||||
|
addq INCY, YY
|
||||||
|
vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9
|
||||||
|
addq INCY, YY
|
||||||
|
|
||||||
|
|
||||||
|
vmovsd %xmm0, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovhpd %xmm0, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovsd %xmm1, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovhpd %xmm1, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovsd %xmm2, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovhpd %xmm2, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
|
||||||
|
vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3
|
||||||
|
|
||||||
|
vmovsd %xmm3, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
vmovhpd %xmm3, 0 * SIZE(Y)
|
||||||
|
addq INCY, Y
|
||||||
|
|
||||||
|
decq %rax
|
||||||
|
jg .L41
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L45:
|
||||||
|
movq M, %rax
|
||||||
|
andq $7, %rax
|
||||||
|
jle .L47
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L46:
|
||||||
|
vmovsd (X), %xmm0
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0
|
||||||
|
|
||||||
|
vmovsd %xmm0, (Y)
|
||||||
|
addq INCY, Y
|
||||||
|
|
||||||
|
decq %rax
|
||||||
|
jg .L46
|
||||||
|
ALIGN_3
|
||||||
|
|
||||||
|
.L47:
|
||||||
|
xorq %rax, %rax
|
||||||
|
|
||||||
|
RESTOREREGISTERS
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,291 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define M ARG1 /* rdi */
|
||||||
|
#define X ARG2 /* rsi */
|
||||||
|
#define INCX ARG3 /* rdx */
|
||||||
|
#define Y ARG4 /* rcx */
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
#define INCY ARG5 /* r8 */
|
||||||
|
#else
|
||||||
|
#define INCY %r10
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
|
||||||
|
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
|
||||||
|
#define A_PRE 640
|
||||||
|
#define B_PRE 640
|
||||||
|
|
||||||
|
/*
 * Double-precision vector copy kernel: y[i * incy] = x[i * incx], i = 0..M-1.
 *
 * In:   M    = element count
 *       X    = source pointer,      INCX = source stride in elements
 *       Y    = destination pointer, INCY = destination stride in elements
 *              (INCY is read from 40(%rsp) under WINDOWS_ABI)
 * Out:  %rax = 0
 * Uses: %rax, %xmm0-%xmm7, flags; M, X, Y, INCX and INCY are modified.
 *
 * Two code paths:
 *   - unit stride (INCX == INCY == 1): 16 elements per iteration through
 *     eight xmm registers, followed by 8/4/2/1-element tails;
 *   - general stride: 8 elements per iteration, then one element at a time.
 */
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	/*
	 * Nothing to do for M <= 0.  Without this check the alignment peel
	 * below stores one element before M is ever tested, and the tail
	 * tests (testq $8, M / andq $7, %rax) act on the low bits of a
	 * negative count.
	 */
	cmpq	$0, M
	jle	.L19

	/* Convert element strides into byte strides. */
	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	/* Anything other than unit stride on both vectors goes to .L40. */
	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

	/* If the SIZE bit of the X address is set, copy one element first. */
	testq	$SIZE, X
	je	.L10

	vmovsd	(X), %xmm0
	vmovsd	%xmm0, (Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4

.L10:
	/*
	 * Bias both pointers by +16 elements; every unit-stride access below
	 * uses displacements relative to the biased pointers.
	 */
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	movq	M,  %rax
	sarq	$4, %rax		/* %rax = number of 16-element blocks */
	jle	.L13

	/* Preload the first block of 16 elements. */
	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3
	vmovups	 -8 * SIZE(X), %xmm4
	vmovups	 -6 * SIZE(X), %xmm5
	vmovups	 -4 * SIZE(X), %xmm6
	vmovups	 -2 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12			/* only one block: just store it */
	ALIGN_4

.L11:
	/*
	 * Main loop: store the block loaded on the previous pass while
	 * loading the next one.  The store/load/prefetch/nop interleaving is
	 * kept exactly as written.
	 */
	prefetchnta	A_PRE(X)
	nop
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	prefetchnta	B_PRE(Y)
	nop
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	VLOAD( 0 * SIZE, X, %xmm0)
	VLOAD( 2 * SIZE, X, %xmm1)
	VLOAD( 4 * SIZE, X, %xmm2)
	VLOAD( 6 * SIZE, X, %xmm3)

	prefetchnta	A_PRE+64(X)
	nop
	vmovups	%xmm4, -8 * SIZE(Y)
	vmovups	%xmm5, -6 * SIZE(Y)
	prefetchnta	B_PRE+64(Y)
	nop
	vmovups	%xmm6, -4 * SIZE(Y)
	vmovups	%xmm7, -2 * SIZE(Y)

	VLOAD( 8 * SIZE, X, %xmm4)
	VLOAD(10 * SIZE, X, %xmm5)
	subq	$-16 * SIZE, Y
	VLOAD(12 * SIZE, X, %xmm6)
	VLOAD(14 * SIZE, X, %xmm7)

	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/* Store the last block that is still held in %xmm0-%xmm7. */
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)
	vmovups	%xmm4,  -8 * SIZE(Y)
	vmovups	%xmm5,  -6 * SIZE(Y)
	vmovups	%xmm6,  -4 * SIZE(Y)
	vmovups	%xmm7,  -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L13:
	/* Tail: 8 elements if bit 3 of M is set. */
	testq	$8, M
	jle	.L14
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	/* Tail: 4 elements if bit 2 of M is set. */
	testq	$4, M
	jle	.L15
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	/* Tail: 2 elements if bit 1 of M is set. */
	testq	$2, M
	jle	.L16
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	/* Tail: final element if bit 0 of M is set. */
	testq	$1, M
	jle	.L19
	ALIGN_3

	vmovsd	-16 * SIZE(X), %xmm0
	vmovsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	/* Common exit for the unit-stride path and the M <= 0 guard. */
	xorq	%rax,%rax

	RESTOREREGISTERS

	ret
	ALIGN_3

.L40:
	/* General-stride path: INCX / INCY are byte strides here. */
	movq	M,  %rax
	sarq	$3, %rax		/* %rax = number of 8-element blocks */
	jle	.L45
	ALIGN_3

.L41:
	/* Gather eight elements from X ... */
	vmovsd	(X), %xmm0
	addq	INCX, X
	vmovsd	(X), %xmm4
	addq	INCX, X
	vmovsd	(X), %xmm1
	addq	INCX, X
	vmovsd	(X), %xmm5
	addq	INCX, X
	vmovsd	(X), %xmm2
	addq	INCX, X
	vmovsd	(X), %xmm6
	addq	INCX, X
	vmovsd	(X), %xmm3
	addq	INCX, X
	vmovsd	(X), %xmm7
	addq	INCX, X

	/* ... and scatter them to Y in the same order. */
	vmovsd	%xmm0, (Y)
	addq	INCY, Y
	vmovsd	%xmm4, (Y)
	addq	INCY, Y
	vmovsd	%xmm1, (Y)
	addq	INCY, Y
	vmovsd	%xmm5, (Y)
	addq	INCY, Y
	vmovsd	%xmm2, (Y)
	addq	INCY, Y
	vmovsd	%xmm6, (Y)
	addq	INCY, Y
	vmovsd	%xmm3, (Y)
	addq	INCY, Y
	vmovsd	%xmm7, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M,  %rax
	andq	$7, %rax		/* %rax = M mod 8 leftover elements */
	jle	.L47
	ALIGN_3

.L46:
	/* Leftover elements, one per iteration. */
	vmovsd	(X), %xmm0
	addq	INCX, X
	vmovsd	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret

	EPILOGUE
|
|
@ -0,0 +1,311 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N ARG1 /* rdi */
|
||||||
|
#define X ARG2 /* rsi */
|
||||||
|
#define INCX ARG3 /* rdx */
|
||||||
|
#define Y ARG4 /* rcx */
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
#define INCY ARG5 /* r8 */
|
||||||
|
#else
|
||||||
|
#define INCY %r10
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define A_PRE 512
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
/*
 * Double-precision dot product kernel: sum of x[i * incx] * y[i * incy]
 * for i = 0..N-1.
 *
 * In:   N    = element count
 *       X, Y = vector pointers, INCX, INCY = strides in elements
 *              (INCY is read from 40(%rsp) under WINDOWS_ABI)
 * Out:  the sum, in both 64-bit lanes of %xmm0 (zero when N <= 0)
 * Uses: %rax, %xmm0-%xmm11, flags; N, X, Y, INCX and INCY are modified.
 *
 * %xmm0-%xmm3 are four independent partial-sum accumulators that are
 * combined at .Lreduce.
 */

/* Load two elements of X at element offset OFF into REG. */
#define DDOT_XLOAD(OFF, REG)	vmovups	OFF * SIZE(X), REG
/* ACC += XV * (two elements of Y at element offset OFF). */
#define DDOT_FMA(OFF, XV, ACC)	vfmaddpd	ACC, OFF * SIZE(Y), XV, ACC

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	/* Element strides -> byte strides. */
	leaq	(, INCY, SIZE), INCY
	leaq	(, INCX, SIZE), INCX

	/* Clear the four accumulators before any exit to .Lreduce. */
	vxorps	%xmm3, %xmm3, %xmm3
	vxorps	%xmm2, %xmm2, %xmm2
	vxorps	%xmm1, %xmm1, %xmm1
	vxorps	%xmm0, %xmm0, %xmm0

	testq	N, N
	jle	.Lreduce

	/* Unit stride on both vectors is required for the packed path. */
	cmpq	$SIZE, INCX
	jne	.Lstrided
	cmpq	$SIZE, INCY
	jne	.Lstrided

	/* Bias both pointers by +16 elements. */
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	/* If the SIZE bit of the Y address is set, consume one element first. */
	testq	$SIZE, Y
	je	.Lunit_main

	vmovsd	-16 * SIZE(X), %xmm0
	vmulsd	-16 * SIZE(Y), %xmm0, %xmm0
	addq	$1 * SIZE, Y
	addq	$1 * SIZE, X
	decq	N
	ALIGN_2

.Lunit_main:
	movq	N, %rax
	sarq	$4, %rax		/* %rax = number of 16-element blocks */
	jle	.Lunit_tail

	/* Preload 16 elements of X. */
	DDOT_XLOAD(-16, %xmm4)
	DDOT_XLOAD(-14, %xmm5)
	DDOT_XLOAD(-12, %xmm6)
	DDOT_XLOAD(-10, %xmm7)

	DDOT_XLOAD(-8, %xmm8)
	DDOT_XLOAD(-6, %xmm9)
	DDOT_XLOAD(-4, %xmm10)
	DDOT_XLOAD(-2, %xmm11)

	decq	%rax
	jle	.Lunit_drain16

	ALIGN_3

.Lunit_loop16:
	/*
	 * Multiply-accumulate the preloaded X block against Y while loading
	 * the next X block; instruction order is kept as scheduled.
	 */
	prefetchnta	A_PRE(Y)

	DDOT_FMA(-16, %xmm4, %xmm0)
	DDOT_FMA(-14, %xmm5, %xmm1)
	prefetchnta	A_PRE(X)
	DDOT_FMA(-12, %xmm6, %xmm2)
	DDOT_FMA(-10, %xmm7, %xmm3)

	DDOT_XLOAD(0, %xmm4)
	DDOT_FMA(-8, %xmm8, %xmm0)
	DDOT_FMA(-6, %xmm9, %xmm1)
	DDOT_XLOAD(2, %xmm5)
	DDOT_XLOAD(4, %xmm6)
	DDOT_FMA(-4, %xmm10, %xmm2)
	DDOT_FMA(-2, %xmm11, %xmm3)
	DDOT_XLOAD(6, %xmm7)

	prefetchnta	A_PRE+64(Y)

	DDOT_XLOAD(8, %xmm8)
	DDOT_XLOAD(10, %xmm9)
	prefetchnta	A_PRE+64(X)
	DDOT_XLOAD(12, %xmm10)
	DDOT_XLOAD(14, %xmm11)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.Lunit_loop16
	ALIGN_3

.Lunit_drain16:
	/* Accumulate the last preloaded block. */
	DDOT_FMA(-16, %xmm4, %xmm0)
	DDOT_FMA(-14, %xmm5, %xmm1)
	DDOT_FMA(-12, %xmm6, %xmm2)
	DDOT_FMA(-10, %xmm7, %xmm3)

	DDOT_FMA(-8, %xmm8, %xmm0)
	DDOT_FMA(-6, %xmm9, %xmm1)
	DDOT_FMA(-4, %xmm10, %xmm2)
	DDOT_FMA(-2, %xmm11, %xmm3)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.Lunit_tail:
	/* Done if N has none of its low four bits set. */
	testq	$15, N
	jle	.Lreduce

	/* 8 elements if bit 3 of N is set. */
	testq	$8, N
	jle	.Lunit_tail4

	DDOT_XLOAD(-16, %xmm4)
	DDOT_XLOAD(-14, %xmm5)
	DDOT_XLOAD(-12, %xmm6)
	DDOT_XLOAD(-10, %xmm7)

	DDOT_FMA(-16, %xmm4, %xmm0)
	DDOT_FMA(-14, %xmm5, %xmm1)
	DDOT_FMA(-12, %xmm6, %xmm2)
	DDOT_FMA(-10, %xmm7, %xmm3)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.Lunit_tail4:
	/* 4 elements if bit 2 of N is set. */
	testq	$4, N
	jle	.Lunit_tail2

	DDOT_XLOAD(-16, %xmm4)
	DDOT_XLOAD(-14, %xmm5)

	DDOT_FMA(-16, %xmm4, %xmm0)
	DDOT_FMA(-14, %xmm5, %xmm1)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.Lunit_tail2:
	/* 2 elements if bit 1 of N is set. */
	testq	$2, N
	jle	.Lunit_tail1

	DDOT_XLOAD(-16, %xmm4)
	DDOT_FMA(-16, %xmm4, %xmm0)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.Lunit_tail1:
	/* Final element if bit 0 of N is set. */
	testq	$1, N
	jle	.Lreduce

	/* vmovsd from memory clears the upper lane, so the packed FMA only
	   changes the low lane of %xmm0. */
	vmovsd	-16 * SIZE(Y), %xmm5
	vmovsd	-16 * SIZE(X), %xmm4
	vfmaddpd	%xmm0, %xmm4, %xmm5, %xmm0
	jmp	.Lreduce
	ALIGN_3

.Lstrided:
	/* General-stride path: INCX / INCY are byte strides here. */
	movq	N, %rax
	sarq	$3, %rax		/* %rax = number of 8-element blocks */
	jle	.Lstrided_tail
	ALIGN_3

.Lstrided_loop8:
	/* First four element pairs, one per accumulator. */
	vmovsd	(X), %xmm4
	addq	INCX, X
	vmovsd	(Y), %xmm8
	addq	INCY, Y
	vmovsd	(X), %xmm5
	addq	INCX, X
	vmovsd	(Y), %xmm9
	addq	INCY, Y

	vmovsd	(X), %xmm6
	addq	INCX, X
	vmovsd	(Y), %xmm10
	addq	INCY, Y
	vmovsd	(X), %xmm7
	addq	INCX, X
	vmovsd	(Y), %xmm11
	addq	INCY, Y

	vfmaddpd	%xmm0, %xmm4, %xmm8,  %xmm0
	vfmaddpd	%xmm1, %xmm5, %xmm9,  %xmm1
	vfmaddpd	%xmm2, %xmm6, %xmm10, %xmm2
	vfmaddpd	%xmm3, %xmm7, %xmm11, %xmm3

	/* Second four element pairs. */
	vmovsd	(X), %xmm4
	addq	INCX, X
	vmovsd	(Y), %xmm8
	addq	INCY, Y
	vmovsd	(X), %xmm5
	addq	INCX, X
	vmovsd	(Y), %xmm9
	addq	INCY, Y

	vmovsd	(X), %xmm6
	addq	INCX, X
	vmovsd	(Y), %xmm10
	addq	INCY, Y
	vmovsd	(X), %xmm7
	addq	INCX, X
	vmovsd	(Y), %xmm11
	addq	INCY, Y

	vfmaddpd	%xmm0, %xmm4, %xmm8,  %xmm0
	vfmaddpd	%xmm1, %xmm5, %xmm9,  %xmm1
	vfmaddpd	%xmm2, %xmm6, %xmm10, %xmm2
	vfmaddpd	%xmm3, %xmm7, %xmm11, %xmm3

	decq	%rax
	jg	.Lstrided_loop8
	ALIGN_3

.Lstrided_tail:
	movq	N, %rax
	andq	$7, %rax		/* %rax = N mod 8 leftover elements */
	jle	.Lreduce
	ALIGN_3

.Lstrided_loop1:
	/* Leftover elements, one pair per iteration, all into %xmm0. */
	vmovsd	(X), %xmm4
	addq	INCX, X
	vmovsd	(Y), %xmm8
	addq	INCY, Y

	vfmaddpd	%xmm0, %xmm4, %xmm8, %xmm0

	decq	%rax
	jg	.Lstrided_loop1
	ALIGN_3

.Lreduce:
	/* Combine the four accumulators, then add the two lanes together. */
	vaddpd	%xmm1, %xmm0, %xmm0
	vaddpd	%xmm3, %xmm2, %xmm2
	vaddpd	%xmm2, %xmm0, %xmm0

	vhaddpd	%xmm0, %xmm0, %xmm0

	RESTOREREGISTERS

	ret

	EPILOGUE
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,667 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
|
||||||
|
#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS
|
||||||
|
|
||||||
|
#define A_PRE 256
|
||||||
|
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
|
#define N ARG1 /* rdi */
|
||||||
|
#define M ARG2 /* rsi */
|
||||||
|
#define A ARG3 /* rdx */
|
||||||
|
#define LDA ARG4 /* rcx */
|
||||||
|
#define B ARG5 /* r8 */
|
||||||
|
|
||||||
|
#define AO1 %r9
|
||||||
|
#define AO2 %r10
|
||||||
|
#define LDA3 %r11
|
||||||
|
#define M8 %r12
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define N ARG1 /* rcx */
|
||||||
|
#define M ARG2 /* rdx */
|
||||||
|
#define A ARG3 /* r8 */
|
||||||
|
#define LDA ARG4 /* r9 */
|
||||||
|
#define OLD_B 40 + 56(%rsp)
|
||||||
|
|
||||||
|
#define B %r12
|
||||||
|
|
||||||
|
#define AO1 %rsi
|
||||||
|
#define AO2 %rdi
|
||||||
|
#define LDA3 %r10
|
||||||
|
#define M8 %r11
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define I %rax
|
||||||
|
|
||||||
|
#define B0 %rbp
|
||||||
|
#define B1 %r13
|
||||||
|
#define B2 %r14
|
||||||
|
#define B3 %r15
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
pushq %rdi
|
||||||
|
pushq %rsi
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pushq %r15
|
||||||
|
pushq %r14
|
||||||
|
pushq %r13
|
||||||
|
pushq %r12
|
||||||
|
pushq %rbp
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
movq OLD_B, B
|
||||||
|
#endif
|
||||||
|
|
||||||
|
subq $-16 * SIZE, B
|
||||||
|
|
||||||
|
movq M, B1
|
||||||
|
movq M, B2
|
||||||
|
movq M, B3
|
||||||
|
|
||||||
|
andq $-8, B1
|
||||||
|
andq $-4, B2
|
||||||
|
andq $-2, B3
|
||||||
|
|
||||||
|
imulq N, B1
|
||||||
|
imulq N, B2
|
||||||
|
imulq N, B3
|
||||||
|
|
||||||
|
leaq (B, B1, SIZE), B1
|
||||||
|
leaq (B, B2, SIZE), B2
|
||||||
|
leaq (B, B3, SIZE), B3
|
||||||
|
|
||||||
|
leaq (,LDA, SIZE), LDA
|
||||||
|
leaq (LDA, LDA, 2), LDA3
|
||||||
|
|
||||||
|
leaq (, N, SIZE), M8
|
||||||
|
|
||||||
|
cmpq $8, N
|
||||||
|
jl .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L11:
|
||||||
|
subq $8, N
|
||||||
|
|
||||||
|
movq A, AO1
|
||||||
|
leaq (A, LDA, 4), AO2
|
||||||
|
leaq (A, LDA, 8), A
|
||||||
|
|
||||||
|
movq B, B0
|
||||||
|
addq $64 * SIZE, B
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L14
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L13:
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO1)
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B0)
|
||||||
|
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO1, LDA, 1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B0)
|
||||||
|
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO1, LDA, 2)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3)
|
||||||
|
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 2 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 4 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 6 * SIZE(B0)
|
||||||
|
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO1, LDA3, 1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 8 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 10 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 14 * SIZE(B0)
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO2)
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 16 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 18 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 20 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 22 * SIZE(B0)
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO2, LDA, 1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 24 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 26 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 28 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 30 * SIZE(B0)
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO2, LDA, 2)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 32 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 34 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 36 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 38 * SIZE(B0)
|
||||||
|
|
||||||
|
prefetchnta A_PRE(AO2, LDA3, 1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 40 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 42 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 44 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 46 * SIZE(B0)
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1
|
||||||
|
addq $8 * SIZE, AO2
|
||||||
|
leaq (B0, M8, 8), B0
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L13
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L14:
|
||||||
|
testq $4, M
|
||||||
|
jle .L16
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B1)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B1)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B1)
|
||||||
|
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B1)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B1)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B1)
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B1)
|
||||||
|
vmovups %xmm1, 2 * SIZE(B1)
|
||||||
|
vmovups %xmm2, 4 * SIZE(B1)
|
||||||
|
vmovups %xmm3, 6 * SIZE(B1)
|
||||||
|
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 8 * SIZE(B1)
|
||||||
|
vmovups %xmm1, 10 * SIZE(B1)
|
||||||
|
vmovups %xmm2, 12 * SIZE(B1)
|
||||||
|
vmovups %xmm3, 14 * SIZE(B1)
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1
|
||||||
|
addq $4 * SIZE, AO2
|
||||||
|
subq $-32 * SIZE, B1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L16:
|
||||||
|
testq $2, M
|
||||||
|
jle .L18
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B2)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B2)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B2)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B2)
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B2)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B2)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B2)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B2)
|
||||||
|
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
addq $2 * SIZE, AO2
|
||||||
|
subq $-16 * SIZE, B2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L18:
|
||||||
|
testq $1, M
|
||||||
|
jle .L19
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 0 * SIZE(AO1, LDA), %xmm1
|
||||||
|
vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2
|
||||||
|
vmovsd 0 * SIZE(AO1, LDA3), %xmm3
|
||||||
|
|
||||||
|
vunpcklpd %xmm1, %xmm0 , %xmm0
|
||||||
|
vunpcklpd %xmm3, %xmm2 , %xmm2
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B3)
|
||||||
|
vmovups %xmm2, -14 * SIZE(B3)
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm0
|
||||||
|
vmovsd 0 * SIZE(AO2, LDA), %xmm1
|
||||||
|
vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2
|
||||||
|
vmovsd 0 * SIZE(AO2, LDA3), %xmm3
|
||||||
|
|
||||||
|
vunpcklpd %xmm1, %xmm0 , %xmm0
|
||||||
|
vunpcklpd %xmm3, %xmm2 , %xmm2
|
||||||
|
|
||||||
|
vmovups %xmm0, -12 * SIZE(B3)
|
||||||
|
vmovups %xmm2, -10 * SIZE(B3)
|
||||||
|
|
||||||
|
subq $-8 * SIZE, B3
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L19:
|
||||||
|
cmpq $8, N
|
||||||
|
jge .L11
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L20:
|
||||||
|
cmpq $4, N
|
||||||
|
jl .L30
|
||||||
|
|
||||||
|
subq $4, N
|
||||||
|
|
||||||
|
movq A, AO1
|
||||||
|
leaq (A, LDA, 2), AO2
|
||||||
|
leaq (A, LDA, 4), A
|
||||||
|
|
||||||
|
movq B, B0
|
||||||
|
addq $32 * SIZE, B
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L24
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L23:
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B0)
|
||||||
|
|
||||||
|
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B0)
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 2 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 4 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 6 * SIZE(B0)
|
||||||
|
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, 8 * SIZE(B0)
|
||||||
|
vmovups %xmm1, 10 * SIZE(B0)
|
||||||
|
vmovups %xmm2, 12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, 14 * SIZE(B0)
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1
|
||||||
|
addq $8 * SIZE, AO2
|
||||||
|
leaq (B0, M8, 8), B0
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L23
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L24:
|
||||||
|
testq $4, M
|
||||||
|
jle .L26
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B1)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B1)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B1)
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2)
|
||||||
|
VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B1)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B1)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B1)
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1
|
||||||
|
addq $4 * SIZE, AO2
|
||||||
|
subq $-16 * SIZE, B1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L26:
|
||||||
|
testq $2, M
|
||||||
|
jle .L28
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1)
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
|
||||||
|
VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B2)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B2)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B2)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B2)
|
||||||
|
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
addq $2 * SIZE, AO2
|
||||||
|
subq $-8 * SIZE, B2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L28:
|
||||||
|
testq $1, M
|
||||||
|
jle .L30
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 0 * SIZE(AO1, LDA), %xmm1
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm2
|
||||||
|
vmovsd 0 * SIZE(AO2, LDA), %xmm3
|
||||||
|
|
||||||
|
vunpcklpd %xmm1, %xmm0, %xmm0
|
||||||
|
vunpcklpd %xmm3, %xmm2, %xmm2
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B3)
|
||||||
|
vmovups %xmm2, -14 * SIZE(B3)
|
||||||
|
subq $-4 * SIZE, B3
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L30:
|
||||||
|
cmpq $2, N
|
||||||
|
jl .L40
|
||||||
|
|
||||||
|
subq $2, N
|
||||||
|
|
||||||
|
movq A, AO1
|
||||||
|
leaq (A, LDA), AO2
|
||||||
|
leaq (A, LDA, 2), A
|
||||||
|
|
||||||
|
movq B, B0
|
||||||
|
addq $16 * SIZE, B
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L34
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L33:
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B0)
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO2, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO2, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -8 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -6 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -4 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -2 * SIZE(B0)
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1
|
||||||
|
addq $8 * SIZE, AO2
|
||||||
|
leaq (B0, M8, 8), B0
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L33
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L34:
|
||||||
|
testq $4, M
|
||||||
|
jle .L36
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm2)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO2, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B1)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B1)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B1)
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1
|
||||||
|
addq $4 * SIZE, AO2
|
||||||
|
subq $-8 * SIZE, B1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L36:
|
||||||
|
testq $2, M
|
||||||
|
jle .L38
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO2, %xmm1)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B2)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B2)
|
||||||
|
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
addq $2 * SIZE, AO2
|
||||||
|
subq $-4 * SIZE, B2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L38:
|
||||||
|
testq $1, M
|
||||||
|
jle .L40
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm1
|
||||||
|
|
||||||
|
vunpcklpd %xmm1, %xmm0, %xmm0
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B3)
|
||||||
|
subq $-2 * SIZE, B3
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
cmpq $1, N
|
||||||
|
jl .L999
|
||||||
|
|
||||||
|
movq A, AO1
|
||||||
|
|
||||||
|
movq B, B0
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L44
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L43:
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
VMOVUPS_A1(4 * SIZE, AO1, %xmm2)
|
||||||
|
VMOVUPS_A1(6 * SIZE, AO1, %xmm3)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B0)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B0)
|
||||||
|
vmovups %xmm2, -12 * SIZE(B0)
|
||||||
|
vmovups %xmm3, -10 * SIZE(B0)
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1
|
||||||
|
leaq (B0, M8, 8), B0
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L43
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L44:
|
||||||
|
testq $4, M
|
||||||
|
jle .L45
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
VMOVUPS_A1(2 * SIZE, AO1, %xmm1)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B1)
|
||||||
|
vmovups %xmm1, -14 * SIZE(B1)
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1
|
||||||
|
subq $-4 * SIZE, B1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L45:
|
||||||
|
testq $2, M
|
||||||
|
jle .L46
|
||||||
|
|
||||||
|
VMOVUPS_A1(0 * SIZE, AO1, %xmm0)
|
||||||
|
|
||||||
|
vmovups %xmm0, -16 * SIZE(B2)
|
||||||
|
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
subq $-2 * SIZE, B2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L46:
|
||||||
|
testq $1, M
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
|
||||||
|
vmovsd %xmm0, -16 * SIZE(B3)
|
||||||
|
jmp .L999
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
popq %rbp
|
||||||
|
popq %r12
|
||||||
|
popq %r13
|
||||||
|
popq %r14
|
||||||
|
popq %r15
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
popq %rsi
|
||||||
|
popq %rdi
|
||||||
|
#endif
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -47,7 +47,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_M %rdi
|
#define OLD_M %rdi
|
||||||
#define OLD_N %rsi
|
#define OLD_N %rsi
|
||||||
|
@ -59,9 +59,14 @@
|
||||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 48 (%rsp)
|
#define ALPHA 48 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 56(%rsp)
|
||||||
|
#define NN 64(%rsp)
|
||||||
|
#define AA 72(%rsp)
|
||||||
|
#define LDAX 80(%rsp)
|
||||||
|
#define XX 88(%rsp)
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_M %rcx
|
#define OLD_M %rcx
|
||||||
#define OLD_N %rdx
|
#define OLD_N %rdx
|
||||||
|
@ -74,6 +79,12 @@
|
||||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 224 (%rsp)
|
#define ALPHA 224 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 232(%rsp)
|
||||||
|
#define NN 240(%rsp)
|
||||||
|
#define AA 248(%rsp)
|
||||||
|
#define LDAX 256(%rsp)
|
||||||
|
#define XX 264(%rsp)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LDA %r8
|
#define LDA %r8
|
||||||
|
@ -137,17 +148,42 @@
|
||||||
movq OLD_LDA, LDA
|
movq OLD_LDA, LDA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
movq STACK_INCX, INCX
|
|
||||||
movq STACK_Y, Y
|
|
||||||
movq STACK_INCY, INCY
|
|
||||||
movq STACK_BUFFER, BUFFER
|
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
movsd %xmm0, ALPHA
|
movsd %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
movsd %xmm3, ALPHA
|
movsd %xmm3, ALPHA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
movq STACK_Y, Y
|
||||||
|
movq A,AA
|
||||||
|
movq N,NN
|
||||||
|
movq M,MMM
|
||||||
|
movq LDA,LDAX
|
||||||
|
movq X,XX
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq I,I
|
||||||
|
addq $1,I
|
||||||
|
salq $21,I
|
||||||
|
subq I,MMM
|
||||||
|
movq I,M
|
||||||
|
jge .L00t
|
||||||
|
|
||||||
|
movq MMM,M
|
||||||
|
addq I,M
|
||||||
|
jle .L999x
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq XX,X
|
||||||
|
movq AA,A
|
||||||
|
movq NN,N
|
||||||
|
movq LDAX,LDA
|
||||||
|
|
||||||
|
movq STACK_INCX, INCX
|
||||||
|
movq STACK_INCY, INCY
|
||||||
|
movq STACK_BUFFER, BUFFER
|
||||||
|
|
||||||
|
|
||||||
leaq -1(INCY), %rax
|
leaq -1(INCY), %rax
|
||||||
|
|
||||||
leaq (,INCX, SIZE), INCX
|
leaq (,INCX, SIZE), INCX
|
||||||
|
@ -2815,6 +2851,12 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
leaq (, M, SIZE), %rax
|
||||||
|
addq %rax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,360 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Argument and scratch-register map for the copy kernel that follows.   */
/* ARG1..ARG5 are supplied by common.h (not visible here); the register  */
/* names in the trailing comments are the ones recorded by the author.   */
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
|
#define M ARG1 /* rdi: inner trip count (elements copied from each column pointer) */
|
||||||
|
#define N ARG2 /* rsi: outer count; consumed two at a time, then one leftover */
|
||||||
|
#define A ARG3 /* rdx: source pointer, advanced by LDA per column */
|
||||||
|
#define LDA ARG4 /* rcx: column stride; multiplied by SIZE in the prologue */
|
||||||
|
#define B ARG5 /* r8: destination pointer, written sequentially */
|
||||||
|
|
||||||
|
#define I %r9 /* inner loop counter */
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define STACKSIZE 256 /* local frame; the prologue stores xmm6-xmm15 at offsets 0..144 of it */
|
||||||
|
|
||||||
|
#define M ARG1 /* rcx: inner trip count */
|
||||||
|
#define N ARG2 /* rdx: outer count */
|
||||||
|
#define A ARG3 /* r8: source pointer */
|
||||||
|
#define LDA ARG4 /* r9: column stride */
|
||||||
|
#define OLD_B 40 + 32 + STACKSIZE(%rsp) /* 5th argument on the stack: 32 = the four 8-byte pushes done in the prologue, STACKSIZE = local frame; the leading 40 is presumably return address + 32-byte shadow space -- confirm against the Win64 ABI */
|
||||||
|
|
||||||
|
#define B %r14 /* destination pointer, loaded from OLD_B; r14 is pushed in the prologue */
|
||||||
|
#define I %r15 /* inner loop counter; r15 is pushed in the prologue */
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define J %r10 /* outer loop counter (N >> 1) */
|
||||||
|
#define AO1 %r11 /* read pointer for the first column of the current pair */
|
||||||
|
#define AO2 %r12 /* read pointer for the second column of the current pair */
|
||||||
|
#define AO3 %r13 /* not referenced by the kernel body that follows */
|
||||||
|
#define AO4 %rax /* not referenced by the kernel body that follows */
|
||||||
|
|
||||||
|
/* Entry sequence: save the registers this kernel pushes, build the      */
/* Windows frame when WINDOWS_ABI is defined, scale LDA by SIZE and set  */
/* up the outer counter J = N >> 1. PROLOGUE / PROFCODE / ALIGN_4 / SIZE */
/* are macros from common.h (not visible here).                          */
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
pushq %r15 /* r15 is used as I under WINDOWS_ABI */
|
||||||
|
pushq %r14 /* r14 is used as B under WINDOWS_ABI */
|
||||||
|
#endif
|
||||||
|
pushq %r13 /* AO3 register; saved and restored although the body never touches it */
|
||||||
|
pushq %r12 /* AO2 register */
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
subq $STACKSIZE, %rsp /* reserve the local frame */
|
||||||
|
|
||||||
|
vmovups %xmm6, 0(%rsp) /* spill xmm6-xmm15; the single-precision path of the main loop overwrites all of them */
|
||||||
|
vmovups %xmm7, 16(%rsp)
|
||||||
|
vmovups %xmm8, 32(%rsp)
|
||||||
|
vmovups %xmm9, 48(%rsp)
|
||||||
|
vmovups %xmm10, 64(%rsp)
|
||||||
|
vmovups %xmm11, 80(%rsp)
|
||||||
|
vmovups %xmm12, 96(%rsp)
|
||||||
|
vmovups %xmm13, 112(%rsp)
|
||||||
|
vmovups %xmm14, 128(%rsp)
|
||||||
|
vmovups %xmm15, 144(%rsp)
|
||||||
|
|
||||||
|
movq OLD_B, B /* fetch the stack-passed destination pointer */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
leaq (,LDA, SIZE), LDA # Scaling
|
||||||
|
|
||||||
|
movq N, J /* J = N */
|
||||||
|
sarq $1, J /* J = N >> 1: number of column pairs */
|
||||||
|
jle .L20 /* no pair to process: go straight to the odd-column tail */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Column-pair loop. Each pass takes two source columns (A and A + LDA) */
/* and writes them to B interleaved: B[2k] = AO1[k], B[2k+1] = AO2[k].   */
/* M is handled as groups of 8 (.L03), one optional group of 4 (.L13)    */
/* and up to 3 single elements (.L15).                                   */
.L01:
|
||||||
|
movq A, AO1 /* AO1 -> first column of the pair */
|
||||||
|
leaq (A, LDA), AO2 /* AO2 -> second column (LDA is already scaled) */
|
||||||
|
leaq (A, LDA, 2), A /* A -> first column of the next pair */
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I /* I = M >> 3: number of 8-element groups */
|
||||||
|
jle .L08 /* fewer than 8 elements: skip the unrolled loop */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L03: /* unrolled body: 8 elements from each column per iteration */
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef DOUBLE /* single-precision path: 16 scalar loads, then 16 scalar stores */
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0 /* even registers take AO1[k], odd registers take AO2[k] */
|
||||||
|
vmovss 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovss 1 * SIZE(AO1), %xmm2
|
||||||
|
vmovss 1 * SIZE(AO2), %xmm3
|
||||||
|
vmovss 2 * SIZE(AO1), %xmm4
|
||||||
|
vmovss 2 * SIZE(AO2), %xmm5
|
||||||
|
vmovss 3 * SIZE(AO1), %xmm6
|
||||||
|
vmovss 3 * SIZE(AO2), %xmm7
|
||||||
|
|
||||||
|
vmovss 4 * SIZE(AO1), %xmm8
|
||||||
|
vmovss 4 * SIZE(AO2), %xmm9
|
||||||
|
vmovss 5 * SIZE(AO1), %xmm10
|
||||||
|
vmovss 5 * SIZE(AO2), %xmm11
|
||||||
|
vmovss 6 * SIZE(AO1), %xmm12
|
||||||
|
vmovss 6 * SIZE(AO2), %xmm13
|
||||||
|
vmovss 7 * SIZE(AO1), %xmm14
|
||||||
|
vmovss 7 * SIZE(AO2), %xmm15
|
||||||
|
|
||||||
|
vmovss %xmm0, 0 * SIZE(B) /* store in register order, which yields the interleaved layout */
|
||||||
|
vmovss %xmm1, 1 * SIZE(B)
|
||||||
|
vmovss %xmm2, 2 * SIZE(B)
|
||||||
|
vmovss %xmm3, 3 * SIZE(B)
|
||||||
|
vmovss %xmm4, 4 * SIZE(B)
|
||||||
|
vmovss %xmm5, 5 * SIZE(B)
|
||||||
|
vmovss %xmm6, 6 * SIZE(B)
|
||||||
|
vmovss %xmm7, 7 * SIZE(B)
|
||||||
|
|
||||||
|
vmovss %xmm8, 8 * SIZE(B)
|
||||||
|
vmovss %xmm9, 9 * SIZE(B)
|
||||||
|
vmovss %xmm10, 10 * SIZE(B)
|
||||||
|
vmovss %xmm11, 11 * SIZE(B)
|
||||||
|
vmovss %xmm12, 12 * SIZE(B)
|
||||||
|
vmovss %xmm13, 13 * SIZE(B)
|
||||||
|
vmovss %xmm14, 14 * SIZE(B)
|
||||||
|
vmovss %xmm15, 15 * SIZE(B)
|
||||||
|
|
||||||
|
#else /* double-precision path: build (AO1[k], AO2[k]) pairs in one register each */
|
||||||
|
prefetchw 256(B) /* prefetch the destination 256 bytes ahead, for writing */
|
||||||
|
|
||||||
|
prefetchnta 256(AO1) /* prefetch the first column 256 bytes ahead, non-temporal hint */
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0 /* low half of xmm_k = AO1[k] */
|
||||||
|
vmovsd 1 * SIZE(AO1), %xmm1
|
||||||
|
vmovsd 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovsd 3 * SIZE(AO1), %xmm3
|
||||||
|
vmovsd 4 * SIZE(AO1), %xmm4
|
||||||
|
vmovsd 5 * SIZE(AO1), %xmm5
|
||||||
|
vmovsd 6 * SIZE(AO1), %xmm6
|
||||||
|
vmovsd 7 * SIZE(AO1), %xmm7
|
||||||
|
|
||||||
|
prefetchnta 256(AO2) /* same prefetch for the second column */
|
||||||
|
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 /* high half of xmm_k = AO2[k], low half kept */
|
||||||
|
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
|
||||||
|
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
|
||||||
|
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
|
||||||
|
vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4
|
||||||
|
vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5
|
||||||
|
vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6
|
||||||
|
vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7
|
||||||
|
|
||||||
|
|
||||||
|
prefetchw 256+64(B) /* second destination prefetch, 64 bytes further on */
|
||||||
|
vmovups %xmm0, 0 * SIZE(B) /* each 16-byte store writes one (AO1[k], AO2[k]) pair */
|
||||||
|
vmovups %xmm1, 2 * SIZE(B)
|
||||||
|
vmovups %xmm2, 4 * SIZE(B)
|
||||||
|
vmovups %xmm3, 6 * SIZE(B)
|
||||||
|
vmovups %xmm4, 8 * SIZE(B)
|
||||||
|
vmovups %xmm5, 10 * SIZE(B)
|
||||||
|
vmovups %xmm6, 12 * SIZE(B)
|
||||||
|
vmovups %xmm7, 14 * SIZE(B)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1 /* advance both column pointers by 8 elements */
|
||||||
|
addq $8 * SIZE, AO2
|
||||||
|
subq $-16 * SIZE, B /* B += 16 elements, written as subtracting a negative constant */
|
||||||
|
decq I
|
||||||
|
jg .L03 /* repeat while 8-element groups remain */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L08: /* optional 4-element group */
|
||||||
|
testq $4 , M
|
||||||
|
je .L14 /* bit 2 of M clear: no 4-element group */
|
||||||
|
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L13: /* reached only by fall-through from .L08; no branch in this kernel targets it */
|
||||||
|
#ifndef DOUBLE /* single precision: 4 elements per column, scalar moves */
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovss 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovss 1 * SIZE(AO1), %xmm2
|
||||||
|
vmovss 1 * SIZE(AO2), %xmm3
|
||||||
|
vmovss 2 * SIZE(AO1), %xmm4
|
||||||
|
vmovss 2 * SIZE(AO2), %xmm5
|
||||||
|
vmovss 3 * SIZE(AO1), %xmm6
|
||||||
|
vmovss 3 * SIZE(AO2), %xmm7
|
||||||
|
|
||||||
|
vmovss %xmm0, 0 * SIZE(B)
|
||||||
|
vmovss %xmm1, 1 * SIZE(B)
|
||||||
|
vmovss %xmm2, 2 * SIZE(B)
|
||||||
|
vmovss %xmm3, 3 * SIZE(B)
|
||||||
|
vmovss %xmm4, 4 * SIZE(B)
|
||||||
|
vmovss %xmm5, 5 * SIZE(B)
|
||||||
|
vmovss %xmm6, 6 * SIZE(B)
|
||||||
|
vmovss %xmm7, 7 * SIZE(B)
|
||||||
|
#else /* double precision: same low/high pairing as the 8-element loop */
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 1 * SIZE(AO1), %xmm1
|
||||||
|
vmovsd 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovsd 3 * SIZE(AO1), %xmm3
|
||||||
|
|
||||||
|
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||||
|
vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1
|
||||||
|
vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2
|
||||||
|
vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3
|
||||||
|
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B)
|
||||||
|
vmovups %xmm1, 2 * SIZE(B)
|
||||||
|
vmovups %xmm2, 4 * SIZE(B)
|
||||||
|
vmovups %xmm3, 6 * SIZE(B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1 /* advance both columns by 4 elements */
|
||||||
|
addq $4 * SIZE, AO2
|
||||||
|
subq $-8 * SIZE, B /* B += 8 elements */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L14: /* remainder: M & 3 elements, one per iteration */
|
||||||
|
movq M, I
|
||||||
|
andq $3, I /* I = M & 3 */
|
||||||
|
jle .L16 /* nothing left in this pair */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L15:
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovss 0 * SIZE(AO2), %xmm1
|
||||||
|
|
||||||
|
vmovss %xmm0, 0 * SIZE(B)
|
||||||
|
vmovss %xmm1, 1 * SIZE(B)
|
||||||
|
#else
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $SIZE, AO1 /* one element forward in each column */
|
||||||
|
addq $SIZE, AO2
|
||||||
|
addq $2 * SIZE, B /* two elements written */
|
||||||
|
decq I
|
||||||
|
jg .L15
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L16: /* end of one column pair */
|
||||||
|
decq J
|
||||||
|
jg .L01 /* next pair, if any */
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Odd-column tail: when bit 0 of N is set, the last column is copied   */
/* to B unchanged, 4 elements per iteration (.L33) and then the         */
/* remaining M & 3 elements one at a time (.L35).                        */
.L20:
|
||||||
|
testq $1, N
|
||||||
|
jle .L999 /* masked result is 0 or 1, so this is taken exactly when bit 0 of N is clear */
|
||||||
|
|
||||||
|
movq A, AO1 /* AO1 -> the remaining column */
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $2, I /* I = M >> 2: number of 4-element groups */
|
||||||
|
jle .L34
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L33:
|
||||||
|
#ifndef DOUBLE /* one 16-byte move per group; assumes SIZE is 4 bytes here -- confirm in common.h */
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B)
|
||||||
|
#else /* two 16-byte moves per group; assumes SIZE is 8 bytes here -- confirm in common.h */
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovups 2 * SIZE(AO1), %xmm1
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(B)
|
||||||
|
vmovups %xmm1, 2 * SIZE(B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1 /* source += 4 elements */
|
||||||
|
subq $-4 * SIZE, B /* destination += 4 elements */
|
||||||
|
decq I
|
||||||
|
jg .L33
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L34: /* remainder of the single column */
|
||||||
|
movq M, I
|
||||||
|
andq $3, I /* I = M & 3 */
|
||||||
|
jle .L999
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L35: /* scalar copy, one element per iteration */
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovss %xmm0, 0 * SIZE(B)
|
||||||
|
#else
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd %xmm0, 0 * SIZE(B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $SIZE, AO1
|
||||||
|
addq $1 * SIZE, B
|
||||||
|
decq I
|
||||||
|
jg .L35
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
/* Common exit: undo the entry sequence in reverse order and return.    */
.L999:
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
vmovups 0(%rsp), %xmm6 /* reload xmm6-xmm15 from the frame offsets used at entry */
|
||||||
|
vmovups 16(%rsp), %xmm7
|
||||||
|
vmovups 32(%rsp), %xmm8
|
||||||
|
vmovups 48(%rsp), %xmm9
|
||||||
|
vmovups 64(%rsp), %xmm10
|
||||||
|
vmovups 80(%rsp), %xmm11
|
||||||
|
vmovups 96(%rsp), %xmm12
|
||||||
|
vmovups 112(%rsp), %xmm13
|
||||||
|
vmovups 128(%rsp), %xmm14
|
||||||
|
vmovups 144(%rsp), %xmm15
|
||||||
|
|
||||||
|
addq $STACKSIZE, %rsp /* release the local frame */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
popq %r12 /* pops mirror the pushes at entry: r12, r13, then r14/r15 on Windows */
|
||||||
|
popq %r13
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
popq %r14
|
||||||
|
popq %r15
|
||||||
|
#endif
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,374 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
|
#define M ARG1 /* rdi */
|
||||||
|
#define N ARG2 /* rsi */
|
||||||
|
#define A ARG3 /* rdx */
|
||||||
|
#define LDA ARG4 /* rcx */
|
||||||
|
#define B ARG5 /* r8 */
|
||||||
|
|
||||||
|
#define I %r10
|
||||||
|
#define J %rbp
|
||||||
|
|
||||||
|
#define AO1 %r9
|
||||||
|
#define AO2 %r15
|
||||||
|
#define AO3 %r11
|
||||||
|
#define AO4 %r14
|
||||||
|
#define BO1 %r13
|
||||||
|
#define M8 %rbx
|
||||||
|
#define BO %rax
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define STACKSIZE 256
|
||||||
|
|
||||||
|
#define M ARG1 /* rcx */
|
||||||
|
#define N ARG2 /* rdx */
|
||||||
|
#define A ARG3 /* r8 */
|
||||||
|
#define LDA ARG4 /* r9 */
|
||||||
|
#define OLD_B 40 + 64 + STACKSIZE(%rsp)
|
||||||
|
|
||||||
|
#define B %rdi
|
||||||
|
|
||||||
|
#define I %r10
|
||||||
|
#define J %r11
|
||||||
|
|
||||||
|
#define AO1 %r12
|
||||||
|
#define AO2 %r13
|
||||||
|
#define AO3 %r14
|
||||||
|
#define AO4 %r15
|
||||||
|
|
||||||
|
#define BO1 %rsi
|
||||||
|
#define M8 %rbp
|
||||||
|
#define BO %rax
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
pushq %rdi
|
||||||
|
pushq %rsi
|
||||||
|
#endif
|
||||||
|
pushq %r15
|
||||||
|
pushq %r14
|
||||||
|
pushq %r13
|
||||||
|
pushq %r12
|
||||||
|
pushq %rbp
|
||||||
|
pushq %rbx
|
||||||
|
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
subq $STACKSIZE, %rsp
|
||||||
|
|
||||||
|
vmovups %xmm6, 0(%rsp)
|
||||||
|
vmovups %xmm7, 16(%rsp)
|
||||||
|
vmovups %xmm8, 32(%rsp)
|
||||||
|
vmovups %xmm9, 48(%rsp)
|
||||||
|
vmovups %xmm10, 64(%rsp)
|
||||||
|
vmovups %xmm11, 80(%rsp)
|
||||||
|
vmovups %xmm12, 96(%rsp)
|
||||||
|
vmovups %xmm13, 112(%rsp)
|
||||||
|
vmovups %xmm14, 128(%rsp)
|
||||||
|
vmovups %xmm15, 144(%rsp)
|
||||||
|
|
||||||
|
movq OLD_B, B
|
||||||
|
#endif
|
||||||
|
|
||||||
|
movq N, %rax
|
||||||
|
andq $-2, %rax
|
||||||
|
imulq M, %rax
|
||||||
|
|
||||||
|
leaq (B, %rax, SIZE), BO1
|
||||||
|
|
||||||
|
leaq (, LDA, SIZE), LDA
|
||||||
|
leaq (, M, SIZE), M8
|
||||||
|
|
||||||
|
movq M, J
|
||||||
|
sarq $1, J
|
||||||
|
jle .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L01:
|
||||||
|
movq A, AO1
|
||||||
|
leaq (A, LDA ), AO2
|
||||||
|
leaq (A, LDA, 2), A
|
||||||
|
|
||||||
|
movq B, BO
|
||||||
|
addq $4 * SIZE, B
|
||||||
|
|
||||||
|
movq N, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L10
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L08:
|
||||||
|
#ifndef DOUBLE
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovsd 4 * SIZE(AO1), %xmm4
|
||||||
|
vmovsd 6 * SIZE(AO1), %xmm6
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovsd 2 * SIZE(AO2), %xmm3
|
||||||
|
vmovsd 4 * SIZE(AO2), %xmm5
|
||||||
|
vmovsd 6 * SIZE(AO2), %xmm7
|
||||||
|
|
||||||
|
vmovsd %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm1, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovsd %xmm2, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm3, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovsd %xmm4, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm5, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovsd %xmm6, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm7, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
prefetchnta 256(AO1)
|
||||||
|
prefetchnta 256(AO2)
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovups 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovups 4 * SIZE(AO1), %xmm4
|
||||||
|
vmovups 6 * SIZE(AO1), %xmm6
|
||||||
|
vmovups 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovups 2 * SIZE(AO2), %xmm3
|
||||||
|
vmovups 4 * SIZE(AO2), %xmm5
|
||||||
|
vmovups 6 * SIZE(AO2), %xmm7
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm1, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovups %xmm2, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm3, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovups %xmm4, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm5, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovups %xmm6, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm7, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $8 * SIZE, AO1
|
||||||
|
addq $8 * SIZE, AO2
|
||||||
|
decq I
|
||||||
|
jg .L08
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.L10:
|
||||||
|
testq $4, N
|
||||||
|
jle .L12
|
||||||
|
#ifndef DOUBLE
|
||||||
|
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovsd 2 * SIZE(AO2), %xmm3
|
||||||
|
|
||||||
|
vmovsd %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm1, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovsd %xmm2, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm3, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovups 2 * SIZE(AO1), %xmm2
|
||||||
|
vmovups 0 * SIZE(AO2), %xmm1
|
||||||
|
vmovups 2 * SIZE(AO2), %xmm3
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm1, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
vmovups %xmm2, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm3, 2 * SIZE(BO)
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $4 * SIZE, AO1
|
||||||
|
addq $4 * SIZE, AO2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L12:
|
||||||
|
testq $2, N
|
||||||
|
jle .L14
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd 0 * SIZE(AO2), %xmm1
|
||||||
|
|
||||||
|
vmovsd %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovsd %xmm1, 2 * SIZE(BO)
|
||||||
|
#else
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovups 0 * SIZE(AO2), %xmm1
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(BO)
|
||||||
|
vmovups %xmm1, 2 * SIZE(BO)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
addq $2 * SIZE, AO2
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L14:
|
||||||
|
testq $1, N
|
||||||
|
jle .L19
|
||||||
|
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovss 0 * SIZE(AO2), %xmm1
|
||||||
|
|
||||||
|
vmovss %xmm0, 0 * SIZE(BO1)
|
||||||
|
vmovss %xmm1, 1 * SIZE(BO1)
|
||||||
|
#else
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0
|
||||||
|
|
||||||
|
vmovups %xmm0, 0 * SIZE(BO1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $2 * SIZE, BO1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L19:
|
||||||
|
decq J
|
||||||
|
jg .L01
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L20:
|
||||||
|
testq $1, M
|
||||||
|
jle .L999
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L31:
|
||||||
|
movq A, AO1
|
||||||
|
movq B, BO
|
||||||
|
|
||||||
|
movq N, I
|
||||||
|
sarq $1, I
|
||||||
|
jle .L33
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L32:
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd %xmm0, 0 * SIZE(BO)
|
||||||
|
#else
|
||||||
|
vmovups 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovups %xmm0, 0 * SIZE(BO)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
addq $2 * SIZE, AO1
|
||||||
|
leaq (BO, M8, 2), BO
|
||||||
|
decq I
|
||||||
|
jg .L32
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L33:
|
||||||
|
testq $1, N
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
#ifndef DOUBLE
|
||||||
|
vmovss 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovss %xmm0, 0 * SIZE(BO1)
|
||||||
|
#else
|
||||||
|
vmovsd 0 * SIZE(AO1), %xmm0
|
||||||
|
vmovsd %xmm0, 0 * SIZE(BO1)
|
||||||
|
#endif
|
||||||
|
addq $1 * SIZE, BO1
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
vmovups 0(%rsp), %xmm6
|
||||||
|
vmovups 16(%rsp), %xmm7
|
||||||
|
vmovups 32(%rsp), %xmm8
|
||||||
|
vmovups 48(%rsp), %xmm9
|
||||||
|
vmovups 64(%rsp), %xmm10
|
||||||
|
vmovups 80(%rsp), %xmm11
|
||||||
|
vmovups 96(%rsp), %xmm12
|
||||||
|
vmovups 112(%rsp), %xmm13
|
||||||
|
vmovups 128(%rsp), %xmm14
|
||||||
|
vmovups 144(%rsp), %xmm15
|
||||||
|
|
||||||
|
addq $STACKSIZE, %rsp
|
||||||
|
#endif
|
||||||
|
|
||||||
|
popq %rbx
|
||||||
|
popq %rbp
|
||||||
|
popq %r12
|
||||||
|
popq %r13
|
||||||
|
popq %r14
|
||||||
|
popq %r15
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
popq %rsi
|
||||||
|
popq %rdi
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -47,7 +47,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_M %rdi
|
#define OLD_M %rdi
|
||||||
#define OLD_N %rsi
|
#define OLD_N %rsi
|
||||||
|
@ -58,10 +58,14 @@
|
||||||
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 48 (%rsp)
|
#define ALPHA 48 (%rsp)
|
||||||
|
#define MMM 56(%rsp)
|
||||||
|
#define NN 64(%rsp)
|
||||||
|
#define AA 72(%rsp)
|
||||||
|
#define LDAX 80(%rsp)
|
||||||
|
#define XX 96(%rsp)
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_M %rcx
|
#define OLD_M %rcx
|
||||||
#define OLD_N %rdx
|
#define OLD_N %rdx
|
||||||
|
@ -74,6 +78,12 @@
|
||||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||||
#define ALPHA 224 (%rsp)
|
#define ALPHA 224 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 232(%rsp)
|
||||||
|
#define NN 240(%rsp)
|
||||||
|
#define AA 248(%rsp)
|
||||||
|
#define LDAX 256(%rsp)
|
||||||
|
#define XX 264(%rsp)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LDA %r8
|
#define LDA %r8
|
||||||
|
@ -137,17 +147,41 @@
|
||||||
movq OLD_LDA, LDA
|
movq OLD_LDA, LDA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
movq STACK_INCX, INCX
|
|
||||||
movq STACK_Y, Y
|
|
||||||
movq STACK_INCY, INCY
|
|
||||||
movq STACK_BUFFER, BUFFER
|
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
movss %xmm0, ALPHA
|
movss %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
movss %xmm3, ALPHA
|
movss %xmm3, ALPHA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
movq M,MMM
|
||||||
|
movq A,AA
|
||||||
|
movq N,NN
|
||||||
|
movq LDA,LDAX
|
||||||
|
movq X,XX
|
||||||
|
movq STACK_Y, Y
|
||||||
|
.L0t:
|
||||||
|
xorq I,I
|
||||||
|
addq $1,I
|
||||||
|
salq $22,I
|
||||||
|
subq I,MMM
|
||||||
|
movq I,M
|
||||||
|
jge .L00t
|
||||||
|
|
||||||
|
movq MMM,M
|
||||||
|
addq I,M
|
||||||
|
jle .L999x
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq AA,A
|
||||||
|
movq NN,N
|
||||||
|
movq LDAX,LDA
|
||||||
|
movq XX,X
|
||||||
|
|
||||||
|
movq STACK_INCX, INCX
|
||||||
|
movq STACK_INCY, INCY
|
||||||
|
movq STACK_BUFFER, BUFFER
|
||||||
|
|
||||||
leaq (,INCX, SIZE), INCX
|
leaq (,INCX, SIZE), INCX
|
||||||
leaq (,INCY, SIZE), INCY
|
leaq (,INCY, SIZE), INCY
|
||||||
leaq (,LDA, SIZE), LDA
|
leaq (,LDA, SIZE), LDA
|
||||||
|
@ -5990,6 +6024,12 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
leaq (,M,SIZE),%rax
|
||||||
|
addq %rax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
|
@ -63,7 +63,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_M %rcx
|
#define OLD_M %rcx
|
||||||
#define OLD_N %rdx
|
#define OLD_N %rdx
|
||||||
|
@ -74,10 +74,10 @@
|
||||||
#define STACK_Y 72 + STACKSIZE(%rsp)
|
#define STACK_Y 72 + STACKSIZE(%rsp)
|
||||||
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||||
#define MMM 216(%rsp)
|
#define MMM 232(%rsp)
|
||||||
#define NN 224(%rsp)
|
#define NN 240(%rsp)
|
||||||
#define AA 232(%rsp)
|
#define AA 248(%rsp)
|
||||||
#define LDAX 240(%rsp)
|
#define LDAX 256(%rsp)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1385,7 +1385,7 @@ ALIGN_5
|
||||||
EXTRA_DY $1, yvec15, xvec7;
|
EXTRA_DY $1, yvec15, xvec7;
|
||||||
EXTRA_DY $1, yvec14, xvec6;
|
EXTRA_DY $1, yvec14, xvec6;
|
||||||
EXTRA_DY $1, yvec13, xvec5;
|
EXTRA_DY $1, yvec13, xvec5;
|
||||||
EXTRA_DY $2, yvec12, xvec4;
|
EXTRA_DY $1, yvec12, xvec4;
|
||||||
#ifndef TRMMKERNEL
|
#ifndef TRMMKERNEL
|
||||||
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
LDL_DX 0*SIZE(C0), xvec0, xvec0;
|
||||||
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
LDH_DX 1*SIZE(C0), xvec0, xvec0;
|
||||||
|
@ -1406,8 +1406,8 @@ STL_DX xvec7, 2*SIZE(C0, ldc, 1);
|
||||||
STH_DX xvec7, 3*SIZE(C0, ldc, 1);
|
STH_DX xvec7, 3*SIZE(C0, ldc, 1);
|
||||||
STL_DX xvec13, 0*SIZE(C0, ldc, 1);
|
STL_DX xvec13, 0*SIZE(C0, ldc, 1);
|
||||||
STH_DX xvec13, 1*SIZE(C0, ldc, 1);
|
STH_DX xvec13, 1*SIZE(C0, ldc, 1);
|
||||||
STL_DX xvec6, 2*SIZE(C0);
|
STL_DX xvec5, 2*SIZE(C0);
|
||||||
STH_DX xvec6, 3*SIZE(C0);
|
STH_DX xvec5, 3*SIZE(C0);
|
||||||
#ifndef TRMMKERNEL
|
#ifndef TRMMKERNEL
|
||||||
LDL_DX 0*SIZE(C1), xvec0, xvec0;
|
LDL_DX 0*SIZE(C1), xvec0, xvec0;
|
||||||
LDH_DX 1*SIZE(C1), xvec0, xvec0;
|
LDH_DX 1*SIZE(C1), xvec0, xvec0;
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
#define OLD_INCX 8 + STACKSIZE(%rsp)
|
||||||
#define OLD_Y 16 + STACKSIZE(%rsp)
|
#define OLD_Y 16 + STACKSIZE(%rsp)
|
||||||
|
@ -51,6 +51,14 @@
|
||||||
#define ALPHA_R 48 (%rsp)
|
#define ALPHA_R 48 (%rsp)
|
||||||
#define ALPHA_I 56 (%rsp)
|
#define ALPHA_I 56 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 64(%rsp)
|
||||||
|
#define NN 72(%rsp)
|
||||||
|
#define AA 80(%rsp)
|
||||||
|
#define XX 88(%rsp)
|
||||||
|
#define LDAX 96(%rsp)
|
||||||
|
#define ALPHAR 104(%rsp)
|
||||||
|
#define ALPHAI 112(%rsp)
|
||||||
|
|
||||||
#define M %rdi
|
#define M %rdi
|
||||||
#define N %rsi
|
#define N %rsi
|
||||||
#define A %rcx
|
#define A %rcx
|
||||||
|
@ -62,7 +70,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 288
|
||||||
|
|
||||||
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
|
||||||
#define OLD_A 48 + STACKSIZE(%rsp)
|
#define OLD_A 48 + STACKSIZE(%rsp)
|
||||||
|
@ -75,6 +83,14 @@
|
||||||
#define ALPHA_R 224 (%rsp)
|
#define ALPHA_R 224 (%rsp)
|
||||||
#define ALPHA_I 232 (%rsp)
|
#define ALPHA_I 232 (%rsp)
|
||||||
|
|
||||||
|
#define MMM 232(%rsp)
|
||||||
|
#define NN 240(%rsp)
|
||||||
|
#define AA 248(%rsp)
|
||||||
|
#define XX 256(%rsp)
|
||||||
|
#define LDAX 264(%rsp)
|
||||||
|
#define ALPHAR 272(%rsp)
|
||||||
|
#define ALPHAI 280(%rsp)
|
||||||
|
|
||||||
#define M %rcx
|
#define M %rcx
|
||||||
#define N %rdx
|
#define N %rdx
|
||||||
#define A %r8
|
#define A %r8
|
||||||
|
@ -136,8 +152,37 @@
|
||||||
movsd OLD_ALPHA_I, %xmm1
|
movsd OLD_ALPHA_I, %xmm1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
movq OLD_INCX, INCX
|
movq A, AA
|
||||||
|
movq N, NN
|
||||||
|
movq M, MMM
|
||||||
|
movq LDA, LDAX
|
||||||
|
movq X, XX
|
||||||
movq OLD_Y, Y
|
movq OLD_Y, Y
|
||||||
|
movsd %xmm0,ALPHAR
|
||||||
|
movsd %xmm1,ALPHAI
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq I,I
|
||||||
|
addq $1,I
|
||||||
|
salq $18,I
|
||||||
|
subq I,MMM
|
||||||
|
movq I,M
|
||||||
|
movsd ALPHAR,%xmm0
|
||||||
|
movsd ALPHAI,%xmm1
|
||||||
|
jge .L00t
|
||||||
|
|
||||||
|
movq MMM,M
|
||||||
|
addq I,M
|
||||||
|
jle .L999x
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq AA, A
|
||||||
|
movq NN, N
|
||||||
|
movq LDAX, LDA
|
||||||
|
movq XX, X
|
||||||
|
|
||||||
|
movq OLD_INCX, INCX
|
||||||
|
# movq OLD_Y, Y
|
||||||
movq OLD_INCY, INCY
|
movq OLD_INCY, INCY
|
||||||
movq OLD_BUFFER, BUFFER
|
movq OLD_BUFFER, BUFFER
|
||||||
|
|
||||||
|
@ -2673,6 +2718,12 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movq M, I
|
||||||
|
salq $ZBASE_SHIFT,I
|
||||||
|
addq I,AA
|
||||||
|
jmp .L0t
|
||||||
|
.L999x:
|
||||||
|
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue