diff --git a/.gitignore b/.gitignore
index 6cfc5b3c1..aaa1b31ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,25 @@
 *.obj
 *.lib
 *.dll
+*.dylib
 *.def
 *.o
 lapack-3.1.1
 lapack-3.1.1.tgz
+lapack-3.4.1
+lapack-3.4.1.tgz
+lapack-3.4.2
+lapack-3.4.2.tgz
 *.so
 *.a
 .svn
 *~
+lib.grd
+nohup.out
 config.h
 Makefile.conf
+Makefile.conf_last
+config_last.h
 getarch
 getarch_2nd
 utest/openblas_utest
diff --git a/Changelog.txt b/Changelog.txt
index e122300ec..54b11ad81 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,115 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.6
+2-Mar-2013
+common:
+ * Improved OpenMP performance slightly. (d744c9)
+ * Improved cblas.h compatibility with Intel MKL. (#185)
+ * Fixed an overflow bug in single-threaded Cholesky factorization.
+ * Fixed a buffer overflow bug in multithreaded hbmv and sbmv. (#174)
+
+x86/x86-64:
+ * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
+   We will tune the performance in the future.
+ * Auto-detect Intel Xeon E7540.
+ * Fixed a buffer overflow bug in gemv. (#173)
+ * Fixed a bug in s/cdot that read an invalid NaN on x86_64. (#189)
+
+MIPS64:
+
+====================================================================
+Version 0.2.5
+26-Nov-2012
+common:
+ * Added NO_SHARED flag to disable generating the shared library.
+ * Compile LAPACKE with the ILP64 model when INTERFACE64=1 (#158)
+ * Export LAPACK 3.4.2 symbols in the shared library. (#147)
+ * Only detect the number of physical CPU cores on Mac OSX. (#157)
+ * Fixed NetBSD build. (#155)
+ * Fixed compilation with TARGET=GENERIC. (#160)
+x86/x86-64:
+ * Restore the original CPU affinity when calling
+   openblas_set_num_threads(1) (#153)
+ * Fixed a SEGFAULT bug in dgemv_t when m is very large. (#154)
+MIPS64:
+
+====================================================================
+Version 0.2.4
+8-Oct-2012
+common:
+ * Upgraded LAPACK to version 3.4.2. (#145)
+ * Provided support for passing CFLAGS, FFLAGS, PFLAGS,
+   FPFLAGS to make. (#137)
+ * f77blas.h: compatibility for compilers without C99 complex
+   number support. (#141)
+x86/x86-64:
+ * Added NO_AVX flag. Check OS support for AVX at runtime. (#139)
+ * Fixed a zdot ABI incompatibility with GCC 4.7 on
+   Windows 32-bit. (#140)
+MIPS64:
+ * Fixed a shared library generation bug.
+ * Fixed a detection bug on the Loongson 3A server.
+====================================================================
+Version 0.2.3
+20-Aug-2012
+common:
+ * Fixed an unstable LAPACK bug in ?laswp. (#130)
+ * Fixed a shared library bug about unloading the library on
+   Linux. (#132)
+ * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2).
+   Please use gcc and IBM xlf. (#134)
+x86/x86-64:
+ * Supported the goto_set_num_threads and openblas_set_num_threads
+   APIs on Windows. They can set the number of threads at runtime.
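A minimal usage sketch of the two thread-control entry points named in the 0.2.3 entry above (not part of the original patch; the prototypes are the ones added to cblas.h later in this diff, and linking with -lopenblas is assumed):

    /* Sketch only: declarations copied from the cblas.h hunk below. */
    void openblas_set_num_threads(int num_threads);
    void goto_set_num_threads(int num_threads);   /* GotoBLAS2-compatible alias */

    int main(void) {
        openblas_set_num_threads(4);  /* use 4 threads from here on */
        goto_set_num_threads(2);      /* same effect through the legacy name */
        return 0;
    }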
+
+====================================================================
+Version 0.2.2
+6-July-2012
+common:
+ * Fixed a DLL function export bug on Windows/MinGW
+ * Support GNU Hurd (Thanks to Sylvestre Ledru)
+ * Support kfreebsd kernel (Thanks to Sylvestre Ledru)
+x86/x86-64:
+ * Support Intel Sandy Bridge 22nm desktop/mobile CPU
+SPARC:
+ * Improved the detection of SPARC (Thanks to Sylvestre Ledru)
+
+====================================================================
+Version 0.2.1
+30-Jun-2012
+common:
+x86/x86-64:
+ * Fixed the SEGFAULT bug about hyper-threading
+ * Support AMD Bulldozer by using the GotoBLAS2 AMD Barcelona codes
+
+====================================================================
+Version 0.2.0
+26-Jun-2012
+common:
+ * Removed the 64-core limitation on the number of CPU cores.
+   Now, it supports up to 256 cores.
+ * Supported the clang compiler.
+ * Fixed some build bugs on FreeBSD.
+x86/x86-64:
+ * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 with AVX instructions.
+   Please use gcc >= 4.6 or clang >= 3.1.
+ * Support AMD Bobcat by using the GotoBLAS2 AMD Barcelona codes.
+
+====================================================================
+Version 0.1.1
+29-Apr-2012
+common:
+ * Upgraded LAPACK to version 3.4.1. (Thanks to Zaheer Chothia)
+ * Supported LAPACKE, a C interface to LAPACK. (Thanks to Zaheer Chothia)
+ * Fixed the build bug (MD5 and download) on Mac OSX.
+ * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
+ * Fixed the compatibility issue for compilers without C99 complex number
+   support (e.g. Visual Studio)
+x86/x86_64:
+ * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
+ * Test alpha=NaN in dscal.
+ * Fixed a SEGFAULT bug in samax on x86 Windows.
+
 ====================================================================
 Version 0.1.0
 23-Mar-2012
diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt
index b6033fe53..be623d608 100644
--- a/GotoBLAS_03FAQ.txt
+++ b/GotoBLAS_03FAQ.txt
@@ -90,6 +90,15 @@
 number of threads will consume extra resource. I recommend you to
 specify minimum number of threads.
 
+1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong?
+
+    A This may be related to a bug in the Linux kernel 2.6.32. Try applying
+      the patch segfaults.patch using
+
+      patch < segfaults.patch
+
+      and see if the crashes persist. Note that this patch will lead to many
+      compiler warnings.
 
 2. Architecture Specific issue or Implementation
 
diff --git a/LICENSE b/LICENSE
index f5e4f35a7..1e93a6a73 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+Copyright (c) 2011, 2012 Lab of Parallel Software and Computational Science, ISCAS
 All rights reserved.
Redistribution and use in source and binary forms, with or without diff --git a/Makefile b/Makefile index ba04aa989..bde7cf376 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ include ./Makefile.system BLASDIRS = interface driver/level2 driver/level3 driver/others -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) BLASDIRS += kernel endif @@ -26,7 +26,7 @@ endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib test ctest shared install +.PHONY : all libs netlib test ctest shared install .NOTPARALLEL : all libs prof lapack-test install all :: libs netlib tests shared @@ -80,6 +80,7 @@ endif @echo shared : +ifndef NO_SHARED ifeq ($(OSNAME), Linux) $(MAKE) -C exports so -ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -99,11 +100,10 @@ ifeq ($(OSNAME), Darwin) endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll +endif endif tests : @@ -147,7 +147,7 @@ ifeq ($(EXPRECISION), 1) echo "#define EXPRECISION">> config_last.h endif ## -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ @@ -165,7 +165,7 @@ prof_blas : $(MAKE) -C $$d prof || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonprof || exit 1 endif @@ -184,7 +184,7 @@ hpl : $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ @@ -203,47 +203,73 @@ ifeq ($(NO_LAPACK), 1) netlib : else -netlib : lapack-3.4.0 patch.for_lapack-3.4.0 lapack-3.4.0/make.inc +netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc ifndef NOFORTRAN - -@$(MAKE) -C lapack-3.4.0 lapacklib + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib +endif +ifndef NO_LAPACKE + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif -prof_lapack : lapack-3.4.0 lapack-3.4.0/make.inc - -@$(MAKE) -C lapack-3.4.0 lapack_prof +prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof -lapack-3.4.0/make.inc : +$(NETLIB_LAPACK_DIR)/make.inc : ifndef NOFORTRAN - -@echo "FORTRAN = $(FC)" > lapack-3.4.0/make.inc - -@echo "OPTS = $(FFLAGS)" >> lapack-3.4.0/make.inc - -@echo "POPTS = $(FPFLAGS)" >> lapack-3.4.0/make.inc - -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.4.0/make.inc - -@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.4.0/make.inc - -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.4.0/make.inc - -@echo "ARCH = $(AR)" >> lapack-3.4.0/make.inc - -@echo "RANLIB = $(RANLIB)" >> lapack-3.4.0/make.inc - -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.4.0/make.inc - -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.4.0/make.inc - -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.4.0/make.inc - -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.4.0/make.inc -# -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.4.0/make.inc - -@cat make.inc >> lapack-3.4.0/make.inc + -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc + -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LOADOPTS = $(FFLAGS) 
$(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
+ifdef INTERFACE64
+	-@echo "override CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
+else
+	-@echo "override CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+endif
+	-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
 endif
 
-lapack-3.4.0 : lapack-3.4.0.tgz
+lapack-3.4.2 : lapack-3.4.2.tgz
 ifndef NOFORTRAN
-	@if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \
+ifndef NO_LAPACK
+	@if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
 	echo $(TAR) zxf $< ;\
-	$(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\
+	$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
+	rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
 	else \
-	echo "	lapack-3.4.0.tgz check sum is wrong (Please use orignal)." ;\
-	rm -rf lapack-3.4.0 ;\
+	rm -rf $(NETLIB_LAPACK_DIR) ;\
+	echo "	Cannot download lapack-3.4.2.tgz or the MD5 checksum is wrong (Please use the original)."; \
+	exit 1; \
 	fi
 endif
+endif
 
-lapack-3.4.0.tgz :
+LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz
+
+lapack-3.4.2.tgz :
 ifndef NOFORTRAN
-	-wget http://www.netlib.org/lapack/lapack-3.4.0.tgz
+#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
+ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
+	curl -O $(LAPACK_URL)
+else
+ifeq ($(OSNAME), FreeBSD)
+	fetch $(LAPACK_URL)
+else
+	wget -O $@ $(LAPACK_URL)
+endif
+endif
 endif
 
 large.tgz :
@@ -256,21 +282,21 @@ ifndef NOFORTRAN
 	-wget http://www.netlib.org/lapack/timing/timing.tgz
 endif
 
-lapack-timing : lapack-3.4.0 large.tgz timing.tgz
+lapack-timing : lapack-3.4.2 large.tgz timing.tgz
 ifndef NOFORTRAN
-	(cd lapack-3.4.0; $(TAR) zxf ../timing.tgz TIMING)
-	(cd lapack-3.4.0/TIMING; $(TAR) zxf ../../large.tgz )
-	make -C lapack-3.4.0 tmglib
-	make -C lapack-3.4.0/TIMING
+	(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
+	(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
+	make -C $(NETLIB_LAPACK_DIR) tmglib
+	make -C $(NETLIB_LAPACK_DIR)/TIMING
 endif
 
 lapack-test :
-	$(MAKE) -C lapack-3.4.0 tmglib
-	$(MAKE) -C lapack-3.4.0/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc
-	@rm -f lapack-3.4.0/TESTING/*.out
-	$(MAKE) -j 1 -C lapack-3.4.0/TESTING
-	$(GREP) failed lapack-3.4.0/TESTING/*.out
+	$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
+	$(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc
+	@rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out
+	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING
+	$(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out
 
 dummy :
 
@@ -288,10 +314,10 @@ clean ::
 #endif
 	@$(MAKE) -C reference clean
 	@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd
*.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
-	@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
-	@if test -d lapack-3.4.0; then \
-	echo deleting lapack-3.4.0; \
-	rm -rf lapack-3.4.0 ;\
+	@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
+	@if test -d $(NETLIB_LAPACK_DIR); then \
+	echo deleting $(NETLIB_LAPACK_DIR); \
+	rm -rf $(NETLIB_LAPACK_DIR) ;\
 	fi
 	@rm -f *.grd Makefile.conf_last config_last.h
-	@echo Done.
\ No newline at end of file
+	@echo Done.
diff --git a/Makefile.install b/Makefile.install
index 46105fc39..87730a10c 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -23,7 +23,7 @@ install : lib.grd
 	@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 	@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 	@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
-	@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
+	@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
 
 	@echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR)
 	@echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h
@@ -32,8 +32,18 @@ install : lib.grd
 	@cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
 	@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
 
+ifndef NO_CBLAS
 	@echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR)
 	@sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h
+endif
+
+ifndef NO_LAPACKE
+	@echo Copying LAPACKE header files to $(OPENBLAS_INCLUDE_DIR)
+	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h
+	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
+	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
+	@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
+endif
 
 #for install static library
 	@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@@ -61,11 +71,9 @@ ifeq ($(OSNAME), Darwin)
 endif
 ifeq ($(OSNAME), WINNT)
 	-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-	-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 	-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-	-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
 endif
 
 	@echo Install OK!
diff --git a/Makefile.getarch b/Makefile.prebuild
similarity index 80%
rename from Makefile.getarch
rename to Makefile.prebuild
index dadfb5b1b..c7d0de70e 100644
--- a/Makefile.getarch
+++ b/Makefile.prebuild
@@ -1,3 +1,5 @@
+# This is triggered by Makefile.system and runs before any of the code is built.
+
 export BINARY
 export USE_OPENMP
@@ -15,7 +17,7 @@ ifdef CPUIDEMU
 EXFLAGS = -DCPUIDEMU -DVENDOR=99
 endif
 
-all: getarch_2nd
+all: getarch_2nd cblas_noconst.h
 	./getarch_2nd 0 >> $(TARGET_MAKE)
 	./getarch_2nd 1 >> $(TARGET_CONF)
@@ -36,4 +38,7 @@ else
 	$(HOSTCC) -I.
$(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
 endif
 
+cblas_noconst.h : cblas.h
+	perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
+
 dummy:
diff --git a/Makefile.rule b/Makefile.rule
index 650478a07..4e238575a 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.1.0
+VERSION = 0.2.6
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -24,10 +24,13 @@ VERSION = 0.1.0
 # Fortran compiler. Default is g77.
 # FC = gfortran
 
-# Even you can specify cross compiler
+# You can even specify a cross compiler. Meanwhile, please set HOSTCC.
 # CC = x86_64-w64-mingw32-gcc
 # FC = x86_64-w64-mingw32-gfortran
 
+# If you use a cross compiler, please set this host compiler.
+# HOSTCC = gcc
+
 # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
 # BINARY=64
 
@@ -45,12 +48,19 @@ VERSION = 0.1.0
 # automatically detected by the the script.
 # NUM_THREADS = 24
 
+# If you don't need to generate the shared library, please comment it in.
+# NO_SHARED = 1
+
 # If you don't need CBLAS interface, please comment it in.
 # NO_CBLAS = 1
 
-# If you don't need LAPACK, please comment it in.
+# If you don't need LAPACK, please comment it in.
+# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
 # NO_LAPACK = 1
 
+# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
+# NO_LAPACKE = 1
+
 # If you want to use legacy threaded Level 3 implementation.
 # USE_SIMPLE_THREADED_LEVEL3 = 1
 
@@ -67,6 +77,10 @@ VERSION = 0.1.0
 # If you want to disable CPU/Memory affinity on Linux.
 # NO_AFFINITY = 1
 
+# Don't use the AVX kernels on Sandy Bridge. This option is compatible with old
+# compilers and OSes. However, the performance is low.
+# NO_AVX = 1
+
 # If you would like to know minute performance report of GotoBLAS.
 # FUNCTION_PROFILE = 1
 
@@ -90,8 +104,8 @@ VERSION = 0.1.0
 
 # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
 # with a single thread. You can use this flag to avoid the overhead of multi-threading
-# in small matrix sizes. The default value is 4.
-# GEMM_MULTITHREAD_THRESHOLD = 4
+# in small matrix sizes. The default value is 50.
+# GEMM_MULTITHREAD_THRESHOLD = 50
 
 # If you need a sanity check by comparing with reference BLAS. It'll be very
 # slow (Not implemented yet).
@@ -104,19 +118,16 @@ VERSION = 0.1.0
 
 # The installation directory.
 # PREFIX = /opt/OpenBLAS
 
-# Common Optimization Flag; -O2 is enough.
-# DEBUG = 1
-
-ifeq ($(DEBUG), 1)
-COMMON_OPT += -g
-# -DDEBUG
-else
-COMMON_OPT += -O2
-endif
+# Common Optimization Flag;
+# The default -O2 is enough.
+# COMMON_OPT = -O2
 
 # Profiling flags
 COMMON_PROF = -pg
 
+# Build Debug version
+# DEBUG = 1
+
 #
 # End of user configuration
 #
diff --git a/Makefile.system b/Makefile.system
index 0fd223d60..5f8c251b0 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -9,8 +9,20 @@ ifndef TOPDIR
 TOPDIR = .
 endif
 
+ifndef NETLIB_LAPACK_DIR
+NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2
+endif
+
 # Default C compiler
+# - Only set if not specified on the command line or inherited from the environment.
+# - CC is an implicit variable so neither '?=' nor 'ifndef' can be used.
+#   http://stackoverflow.com/questions/4029274/mingw-and-make-variables
+# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
+ifeq ($(origin CC),default)
 CC = gcc
+endif
+
+# Default Fortran compiler (FC) is selected by f_check.
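Referring back to the GEMM_MULTITHREAD_THRESHOLD comment in the Makefile.rule hunk above: gemm calls whose dimensions are at or below the threshold skip the threading machinery entirely. A minimal C sketch (not part of the patch), assuming a default multithreaded build (threshold 50) linked with -lopenblas:

    #include <cblas.h>

    int main(void) {
        double a[16] = {0}, b[16] = {0}, c[16] = {0};
        /* m = n = k = 4 is below the default threshold of 50, so this
         * call is expected to run on a single thread, avoiding the
         * overhead of multi-threading for a tiny matrix. */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    4, 4, 4, 1.0, a, 4, b, 4, 0.0, c, 4);
        return 0;
    }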
ifndef MAKEFILE_RULE
include $(TOPDIR)/Makefile.rule
@@ -41,16 +53,24 @@ GETARCH_FLAGS += -DUSE64BITINT
 endif
 
 ifndef GEMM_MULTITHREAD_THRESHOLD
-GEMM_MULTITHREAD_THRESHOLD=4
+GEMM_MULTITHREAD_THRESHOLD=50
 endif
 GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
 
+ifeq ($(NO_AVX), 1)
+GETARCH_FLAGS += -DNO_AVX
+endif
+
+ifeq ($(DEBUG), 1)
+GETARCH_FLAGS += -g
+endif
+
 # This operation is expensive, so it should be executed only once.
 ifndef GOTOBLAS_MAKEFILE
 export GOTOBLAS_MAKEFILE = 1
 
 # Generating Makefile.conf and config.h
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
+DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
 
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
@@ -101,6 +121,15 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap
 
 ifeq ($(OSNAME), Darwin)
 export MACOSX_DEPLOYMENT_TARGET=10.2
+MD5SUM = md5 -r
+endif
+
+ifeq ($(OSNAME), FreeBSD)
+MD5SUM = md5 -r
+endif
+
+ifeq ($(OSNAME), NetBSD)
+MD5SUM = md5 -n
 endif
 
 ifeq ($(OSNAME), Linux)
@@ -120,6 +149,26 @@ EXTRALIB += -defaultlib:advapi32
 SUFFIX = obj
 PSUFFIX = pobj
 LIBSUFFIX = lib
+ifeq ($(C_COMPILER), GCC)
+# Test for MS_ABI support
+GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
+GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
+ifeq ($(GCCVERSIONGT4), 1)
+# GCC major version > 4
+# It is compatible with MSVC ABI.
+CCOMMON_OPT += -DMS_ABI
+endif
+
+ifeq ($(GCCVERSIONGTEQ4), 1)
+ifeq ($(GCCMINORVERSIONGTEQ7), 1)
+# GCC version >= 4.7
+# It is compatible with MSVC ABI.
+CCOMMON_OPT += -DMS_ABI +endif +endif + +endif endif ifeq ($(OSNAME), Interix) @@ -223,14 +272,20 @@ endif endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +ifneq ($(NO_AVX), 1) +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER +endif endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +ifneq ($(NO_AVX), 1) +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER +endif endif ifndef DYNAMIC_CORE @@ -459,11 +514,28 @@ ifdef INTERFACE64 FCOMMON_OPT += -i8 endif endif + +ifeq ($(ARCH), mips64) +ifndef BINARY64 +FCOMMON_OPT += -n32 +else +FCOMMON_OPT += -n64 +endif +ifeq ($(CORE), LOONGSON3A) +FCOMMON_OPT += -loongson3 +endif + +ifeq ($(CORE), LOONGSON3B) +FCOMMON_OPT += -loongson3 +endif + +else ifndef BINARY64 FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif +endif ifdef USE_OPENMP FEXTRALIB += -lstdc++ @@ -472,12 +544,30 @@ endif endif ifeq ($(C_COMPILER), OPEN64) + +ifeq ($(ARCH), mips64) +ifndef BINARY64 +CCOMMON_OPT += -n32 +else +CCOMMON_OPT += -n64 +endif +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -loongson3 +endif + +ifeq ($(CORE), LOONGSON3B) +CCOMMON_OPT += -loongson3 +endif + +else + ifndef BINARY64 CCOMMON_OPT += -m32 else CCOMMON_OPT += -m64 endif endif +endif ifeq ($(C_COMPILER), SUN) CCOMMON_OPT += -w @@ -533,6 +623,16 @@ endif ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK +#Disable LAPACK C interface +NO_LAPACKE = 1 +endif + +ifeq ($(NO_LAPACKE), 1) +CCOMMON_OPT += -DNO_LAPACKE +endif + +ifeq ($(NO_AVX), 1) +CCOMMON_OPT += -DNO_AVX endif ifdef SMP @@ -651,17 +751,30 @@ PATCH = patch GREP = grep endif +ifndef MD5SUM MD5SUM = md5sum +endif + AWK = awk REVISION = -r$(VERSION) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) -CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) +ifeq ($(DEBUG), 1) +COMMON_OPT += -g +endif -FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) -FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) +ifndef COMMON_OPT +COMMON_OPT = -O2 +endif + + +override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) +override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) + +override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) +override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) +#MAKEOVERRIDES = ifndef SUFFIX SUFFIX = o @@ -675,7 +788,7 @@ ifndef LIBSUFFIX LIBSUFFIX = a endif -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) ifndef SMP LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) @@ -694,8 +807,8 @@ endif endif +LIBDLLNAME = $(LIBPREFIX).dll LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) @@ -740,6 +853,7 @@ export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 +export HAVE_AVX export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/Makefile.tail b/Makefile.tail index 64f98ab0c..53dd0caad 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) 
$(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX -$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX -$(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX -$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX -$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX -$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX +$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX +$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX +$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX +$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX +$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX +$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ diff --git a/README b/README deleted file mode 100644 index c8c2c2c55..000000000 --- a/README +++ /dev/null @@ -1,83 +0,0 @@ -OpenBLAS Readme - -1.Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) - -2.Intallation -Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, -check out codes from git://github.com/xianyi/OpenBLAS.git -1)Normal compile - (a) type "make" to detect the CPU automatically. - or - (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. - -2)Cross compile -Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. - -examples: -On X86 box, compile this library for loongson3a CPU. -make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A - -3)Debug version -make DEBUG=1 - -4)Intall to the directory (Optional) -e.g. -make install PREFIX=your_installation_directory -The default directory is /opt/OpenBLAS - -3.Support CPU & OS -Please read GotoBLAS_01Readme.txt - -Additional support CPU: -x86_64: - Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. -MIPS64: - ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. - -4.Usages -Link with libopenblas.a or -lopenblas for shared library. - -4.1 Set the number of threads with environment variables. for example, -export OPENBLAS_NUM_THREADS=4 - or -export GOTO_NUM_THREADS=4 - or -export OMP_NUM_THREADS=4 - -The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. - -If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. 
OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
-
-4.2 Set the number of threads with calling functions. for example,
-void goto_set_num_threads(int num_threads);
-or
-void openblas_set_num_threads(int num_threads);
-
-If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
-
-5.Report Bugs
-Please add a issue in https://github.com/xianyi/OpenBLAS/issues
-
-6.To-Do List:
-Optimization on ICT Loongson 3A CPU
-
-7.Contact
-OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
-
-8.ChangeLog
-Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
-
-9.Known Issues
-* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
- is 64. On 32 bits, it is 32.
-* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS.
-
-10. Specification of Git Branches
-We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
-Now, there are 4 branches in github.com.
- * The master branch. This a main branch to reflect a production-ready state.
- * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
- * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
- * The gh-pages branch. This is for web pages
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..0e3a49530
--- /dev/null
+++ b/README.md
@@ -0,0 +1,117 @@
+# OpenBLAS
+
+## Introduction
+OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
+
+Please read the documents on the OpenBLAS wiki pages <https://github.com/xianyi/OpenBLAS/wiki>.
+
+## Installation
+Download from the project homepage: http://xianyi.github.com/OpenBLAS/
+
+Or, check out the code from git://github.com/xianyi/OpenBLAS.git
+### Normal compile
+ * type "make" to detect the CPU automatically.
+   or
+ * type "make TARGET=xxx" to set the target CPU, e.g. "make TARGET=NEHALEM". The full target list is in the file TargetList.txt.
+
+### Cross compile
+Please set CC and FC to the cross toolchains. Then, set HOSTCC to your host C compiler. At last, set TARGET explicitly.
+
+Examples:
+
+On an X86 box, compile this library for a loongson3a CPU.
+
+    make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
+
+On an X86 box, compile this library for a loongson3a CPU with the loongcc (based on Open64) compiler.
+
+    make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
+
+### Debug version
+
+    make DEBUG=1
+
+### Install to the directory (Optional)
+
+Example:
+
+    make install PREFIX=your_installation_directory
+
+The default directory is /opt/OpenBLAS
+
+## Support CPU & OS
+Please read GotoBLAS_01Readme.txt
+
+### Additional support CPU:
+
+#### x86/x86-64:
+- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
+- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
+- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
+- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels.
(Thanks to Werner Saar)
+
+#### MIPS64:
+- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
+- **ICT Loongson 3B**: Experimental
+
+### Support OS:
+- **GNU/Linux**
+- **MingWin/Windows**: Please read the documents on the OpenBLAS wiki.
+- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
+- **FreeBSD**: Supported by the community. We didn't test the library on this OS.
+
+## Usages
+Link with libopenblas.a or -lopenblas for the shared library.
+
+### Set the number of threads with environment variables.
+
+Examples:
+
+    export OPENBLAS_NUM_THREADS=4
+
+ or
+
+    export GOTO_NUM_THREADS=4
+
+ or
+
+    export OMP_NUM_THREADS=4
+
+The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
+
+If you compile this lib with USE_OPENMP=1, you should set the OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
+
+### Set the number of threads at runtime.
+
+We provide the functions below to control the number of threads at runtime.
+
+    void goto_set_num_threads(int num_threads);
+
+    void openblas_set_num_threads(int num_threads);
+
+If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
+
+## Report Bugs
+Please add an issue at https://github.com/xianyi/OpenBLAS/issues
+
+## Contact
+OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
+
+## ChangeLog
+Please see Changelog.txt to obtain the differences from the GotoBLAS2 1.13 BSD version.
+
+## Troubleshooting
+* Please read the [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
+* Please use gcc version 4.6 and above to compile the Sandy Bridge AVX kernels on Linux/MingW/BSD.
+* Please use Clang version 3.1 and above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 will generate wrong AVX binary code.
+* The number of CPUs/Cores should be less than or equal to 256.
+* On Linux, OpenBLAS sets the processor affinity by default. This may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
+* On Loongson 3A, make test may fail with a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase in a shell.
+
+## Specification of Git Branches
+We use the git branching model described in this article (http://nvie.com/posts/a-successful-git-branching-model/).
+Now, there are 4 branches on github.com.
+ * The master branch. This is a main branch to reflect a production-ready state.
+ * The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release.
+ * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature into the develop branch in the future.
+ * The gh-pages branch.
This is for web pages.
diff --git a/TargetList.txt b/TargetList.txt
index 1c3d7c5b9..ce35a3faa 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -8,8 +8,8 @@ Supported List:
 1.X86/X86_64
 a)Intel CPU:
 P2
-COPPERMINE
 KATMAI
+COPPERMINE
 NORTHWOOD
 PRESCOTT
 BANIAS
@@ -18,6 +18,7 @@ CORE2
 PENRYN
 DUNNINGTON
 NEHALEM
+SANDYBRIDGE
 ATOM
 
 b)AMD CPU:
@@ -27,6 +28,8 @@ OPTERON_SSE3
 BARCELONA
 SHANGHAI
 ISTANBUL
+BOBCAT
+BULLDOZER
 
 c)VIA CPU:
 SSE_GENERIC
@@ -47,6 +50,7 @@ CELL
 3.MIPS64 CPU:
 SICORTEX
 LOONGSON3A
+LOONGSON3B
 
 4.IA64 CPU:
 ITANIUM2
diff --git a/c_check b/c_check
index 263efeb3d..4d82237d4 100644
--- a/c_check
+++ b/c_check
@@ -43,14 +43,14 @@
 $compiler = DEC if ($data =~ /COMPILER_DEC/);
 $compiler = GCC if ($compiler eq "");
 
 $os = Linux if ($data =~ /OS_LINUX/);
-$os = FreeBSD if ($data =~ /OS_FreeBSD/);
-$os = NetBSD if ($data =~ /OS_NetBSD/);
-$os = Darwin if ($data =~ /OS_Darwin/);
-$os = SunOS if ($data =~ /OS_SunOS/);
+$os = FreeBSD if ($data =~ /OS_FREEBSD/);
+$os = NetBSD if ($data =~ /OS_NETBSD/);
+$os = Darwin if ($data =~ /OS_DARWIN/);
+$os = SunOS if ($data =~ /OS_SUNOS/);
 $os = AIX if ($data =~ /OS_AIX/);
 $os = osf if ($data =~ /OS_OSF/);
 $os = WINNT if ($data =~ /OS_WINNT/);
-$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/);
+$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
 $os = Interix if ($data =~ /OS_INTERIX/);
 
 $architecture = x86 if ($data =~ /ARCH_X86/);
@@ -174,6 +174,8 @@
 $linker_a = "";
 
 $link =~ s/\-Y\sP\,/\-Y/g;
 
 @flags = split(/[\s\,\n]/, $link);
+ # remove leading and trailing quotes from each flag.
+ @flags = map {s/^['"]|['"]$//g; $_} @flags;
 
 foreach $flags (@flags) {
 if (
diff --git a/cblas.h b/cblas.h
index 34adc5e99..6684262e2 100644
--- a/cblas.h
+++ b/cblas.h
@@ -1,287 +1,293 @@
 #ifndef CBLAS_H
 #define CBLAS_H
 
+#include <stddef.h>
+#include "common.h"
+
 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */
 
-#include <stddef.h>
-#include "common.h"
+/* Set the number of threads at runtime. */
+void openblas_set_num_threads(int num_threads);
+void goto_set_num_threads(int num_threads);
+
+/* Get the build configuration at runtime. */
+char* openblas_get_config(void);
 
 #define CBLAS_INDEX size_t
 
-enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
-enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
-enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
-enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
-enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
+typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
+typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
+typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
+typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
 
-float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
-double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
-float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
-double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
+float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
+double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); +openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); +void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); +float cblas_sasum (const blasint n, const float *x, const blasint incx); +double cblas_dasum (const blasint n, const double *x, const blasint incx); +float cblas_scasum(const blasint n, const float *x, const blasint incx); +double cblas_dzasum(const blasint n, const double *x, const blasint incx); -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); +float cblas_snrm2 (const blasint N, const float *X, const blasint incX); +double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); +float cblas_scnrm2(const blasint N, const float *X, const blasint incX); +double cblas_dznrm2(const blasint N, const double *X, const blasint incX); -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); +CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_izamax(const blasint 
n, const double *x, const blasint incx); -void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); +void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); +void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); +void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); +void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); +void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); +void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_drotmg(double *d1, 
double *d2, double *b1, const double b2, double *P); -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); +void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); +void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); +void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); +void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); +void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); +void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); +void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); +void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint 
lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); +void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); +void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, + const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, + const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, + const float *Y, const 
blasint incY, float *A, const blasint lda); +void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, + const double *Y, const blasint incY, double *A, const blasint lda); -void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, 
blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - 
blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const 
blasint N, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, + const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, + const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); +void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); +void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); +void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); +void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); +void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); +void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); +void cblas_chpr2(const 
enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); +void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const 
double beta, double *C, const blasint ldc); +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, 
const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER 
Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, 
blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); void cblas_xerbla(blasint p, char *rout, char *form, ...); #ifdef __cplusplus } - #endif /* __cplusplus */ #endif diff --git a/common.h b/common.h index e848f33f3..d46a5230a 100644 --- a/common.h +++ b/common.h @@ -68,7 +68,7 @@ extern "C" { #define SMP #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define WINDOWS_ABI #define OS_WINDOWS @@ -89,7 +89,7 @@ extern "C" { #include #endif -#ifdef OS_DARWIN +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) #include #endif @@ -351,7 +351,12 @@ typedef int blasint; #endif #define MMAP_ACCESS (PROT_READ | PROT_WRITE) + +#ifdef __NetBSD__ +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) +#else #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) +#endif #include "param.h" 
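The NetBSD branch just above exists because NetBSD's <sys/mman.h> spells the anonymous-mapping flag MAP_ANON rather than MAP_ANONYMOUS. Below is a minimal, self-contained sketch of how the MMAP_ACCESS and MMAP_POLICY macros are consumed; the allocation shape is illustrative only, not OpenBLAS's actual memory.c allocator.

#include <sys/mman.h>
#include <stdio.h>

#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
#ifdef __NetBSD__
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON)
#else
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS)
#endif

int main(void) {
    size_t size = 1u << 20;  /* hypothetical 1 MB scratch area */
    /* Anonymous mappings take fd = -1 and offset = 0. */
    void *buf = mmap(NULL, size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
    if (buf == MAP_FAILED) { perror("mmap"); return 1; }
    /* ... hand buf to a BLAS kernel as scratch space ... */
    munmap(buf, size);
    return 0;
}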
#include "common_param.h" @@ -374,6 +379,31 @@ typedef int blasint; #endif #endif +#ifndef ASSEMBLER +#ifndef NOINCLUDE +/* Inclusion of a standard header file is needed for definition of __STDC_* + predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs + as a side effect of including either or . */ +#include +#endif // NOINCLUDE + +/* C99 supports complex floating numbers natively, which GCC also offers as an + extension since version 3.0. If neither are available, use a compatible + structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) + #define OPENBLAS_COMPLEX_C99 + typedef float _Complex openblas_complex_float; + typedef double _Complex openblas_complex_double; + typedef xdouble _Complex openblas_complex_xdouble; +#else + #define OPENBLAS_COMPLEX_STRUCT + typedef struct { float real, imag; } openblas_complex_float; + typedef struct { double real, imag; } openblas_complex_double; + typedef struct { xdouble real, imag; } openblas_complex_xdouble; +#endif +#endif // ASSEMBLER + #ifndef IFLUSH #define IFLUSH #endif @@ -528,7 +558,8 @@ typedef struct { #include "common_level3.h" #include "common_lapack.h" #ifdef CBLAS -#include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER diff --git a/common_interface.h b/common_interface.h index 898d91001..14c2cf7a4 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,6 +45,8 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); +void openblas_set_num_threads_(int *); + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); @@ -74,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #elif defined RETURN_BY_STACK -void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); -void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *); +void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); #else -float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); -float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); -double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); -double 
_Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); -xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #endif void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); @@ -640,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +/* Lapack routines */ + int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); @@ -675,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); +int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); @@ -689,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, 
xdouble *, blasint *, blasint *); +int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); diff --git a/common_linux.h b/common_linux.h index b0381d991..6766ff37c 100644 --- a/common_linux.h +++ b/common_linux.h @@ -86,7 +86,13 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned return syscall(SYS_set_mempolicy, mode, addr, flag); } -static inline int my_gettid(void) { return syscall(SYS_gettid); } +static inline int my_gettid(void) { +#ifdef SYS_gettid +return syscall(SYS_gettid); +#else +return getpid(); +#endif +} #endif #endif diff --git a/common_reference.h b/common_reference.h index 4cc4be4fd..be151e0d6 100644 --- a/common_reference.h +++ b/common_reference.h @@ -63,5 +63,7 @@ double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); + +FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); #endif diff --git a/common_thread.h b/common_thread.h index dc963a635..97e060976 100644 --- a/common_thread.h +++ b/common_thread.h @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { int openmp_nthreads=0; #endif - if ((blas_cpu_number == 1) + if (blas_cpu_number == 1 #ifdef USE_OPENMP || omp_in_parallel() diff --git a/common_x86.h b/common_x86.h index fbb91f888..4316318ec 100644 --- a/common_x86.h +++ b/common_x86.h @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define PROFCODE #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define SAVEREGISTERS \ subl $32, %esp;\ movups %xmm6, 0(%esp);\ @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define RESTOREREGISTERS #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define PROLOGUE \ .text; \ .align 16; \ @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ @@ -356,4 +356,11 @@ REALNAME: #ifndef ALIGN_6 #define ALIGN_6 .align 64 + +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. 
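+// (The bytes 0xdf, 0xc0 below are the encoding of "ffreep %st(0)": opcode
+// 0xdf with ModRM byte 0xc0+i selects %st(i), freeing that x87 register
+// and popping the stack in a single instruction.)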
+// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep +#define ffreep .byte 0xdf, 0xc0 # +#endif #endif diff --git a/common_x86_64.h b/common_x86_64.h index 53b702185..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif @@ -448,4 +449,10 @@ REALNAME: #define ALIGN_6 .align 64 #endif +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep +#define ffreep .byte 0xdf, 0xc0 # +#endif #endif diff --git a/cpuid.h b/cpuid.h index 665ede077..c52d503cc 100644 --- a/cpuid.h +++ b/cpuid.h @@ -103,6 +103,9 @@ #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 +#define CORE_SANDYBRIDGE 20 +#define CORE_BOBCAT 21 +#define CORE_BULLDOZER 22 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -122,6 +125,8 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -188,4 +193,7 @@ typedef struct { #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 +#define CPUTYPE_SANDYBRIDGE 44 +#define CPUTYPE_BOBCAT 45 +#define CPUTYPE_BULLDOZER 46 #endif diff --git a/cpuid_mips.c b/cpuid_mips.c index 217492dd7..45171da5e 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -101,12 +101,14 @@ int detect(void){ fclose(infile); + if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("system type", buffer, 11)){ p = strchr(buffer, ':') + 2; @@ -119,6 +121,24 @@ int detect(void){ }else{ return CPU_SICORTEX; } + } + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + } + } #endif return CPU_UNKNOWN; } diff --git a/cpuid_x86.c b/cpuid_x86.c index e183e9fc3..317774691 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -40,6 +40,13 @@ #include #include "cpuid.h" +#ifdef NO_AVX +#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM +#define CORE_SANDYBRIDGE CORE_NEHALEM +#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA +#define CORE_BULLDOZER CORE_BARCELONA +#endif + #ifndef CPUIDEMU #if defined(__APPLE__) && defined(__i386__) @@ -109,6 +116,33 @@ static inline int have_excpuid(void){ return eax & 0xffff; } +#ifndef NO_AVX +static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} +#endif + +int support_avx(){ +#ifndef NO_AVX + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +#else + return 0; +#endif +} + + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -189,11 +223,17 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; +#ifndef NO_AVX + if (support_avx()) feature |= HAVE_AVX; +#endif if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; +#ifndef NO_AVX + if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; +#endif if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; } @@ -974,21 +1014,44 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: - switch (model) { - case 5: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - return CPUTYPE_NEHALEM; - case 10: - //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_NEHALEM; - case 12: - //Xeon Processor 5600 (Westmere-EP) - return CPUTYPE_NEHALEM; - } - break; + case 2: + switch (model) { + case 5: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + return CPUTYPE_NEHALEM; + case 10: + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; //OS doesn't support AVX + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CPUTYPE_NEHALEM; + 
case 13: + //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 14: + // Xeon E7540 + case 15: + //Xeon Processor E7 (Westmere-EX) + return CPUTYPE_NEHALEM; + } + break; + case 3: + switch (model) { + case 10: + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -1021,6 +1084,13 @@ case 1: case 10: return CPUTYPE_BARCELONA; + case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CPUTYPE_BULLDOZER; + else + return CPUTYPE_BARCELONA; //OS doesn't support AVX. + case 5: + return CPUTYPE_BOBCAT; } break; } @@ -1140,6 +1210,9 @@ static char *cpuname[] = { "NSGEODE", "VIAC3", "NANO", + "SANDYBRIDGE", + "BOBCAT", + "BULLDOZER", }; static char *lowercpuname[] = { @@ -1186,6 +1259,9 @@ "tms3x00", "nsgeode", "nano", + "sandybridge", + "bobcat", + "bulldozer", }; static char *corename[] = { @@ -1209,6 +1285,9 @@ "NEHALEM", "ATOM", "NANO", + "SANDYBRIDGE", + "BOBCAT", + "BULLDOZER", }; static char *corename_lower[] = { @@ -1232,6 +1311,9 @@ "nehalem", "atom", "nano", + "sandybridge", + "bobcat", + "bulldozer", }; @@ -1315,10 +1397,33 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_NEHALEM; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; + case 13: + //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX + case 14: + //Xeon E7540 + case 15: + //Xeon Processor E7 (Westmere-EX) + return CORE_NEHALEM; + } + break; + case 3: + switch (model) { + case 10: + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX } break; } @@ -1334,7 +1439,15 @@ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; + else if (exfamily == 5) return CORE_BOBCAT; + else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CORE_BULLDOZER; + else + return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels. + }else return CORE_BARCELONA; } } @@ -1400,6 +1513,9 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); + } else { + //fall back for some virtual machines.
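+ //(Some hypervisors expose no TLB descriptors through cpuid, so the probe
+ // above finds nothing; emitting a conservative default keeps
+ // DTB_DEFAULT_ENTRIES defined in the generated config.)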
+ printf("#define DTB_DEFAULT_ENTRIES 32\n"); } features = get_cputype(GET_FEATURE); @@ -1414,8 +1530,10 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); @@ -1479,7 +1597,9 @@ void get_sse(void){ if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); } diff --git a/ctest.c b/ctest.c index 0c373bf2b..95a5e8bb2 100644 --- a/ctest.c +++ b/ctest.c @@ -34,20 +34,20 @@ COMPILER_GNU OS_LINUX #endif -#if defined(__FreeBSD__) -OS_FreeBSD +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +OS_FREEBSD #endif #if defined(__NetBSD__) -OS_NetBSD +OS_NETBSD #endif #if defined(__sun) -OS_SunOS +OS_SUNOS #endif #if defined(__APPLE__) -OS_Darwin +OS_DARWIN #endif #if defined(_AIX) @@ -63,13 +63,18 @@ OS_WINNT #endif #if defined(__CYGWIN__) -OS_CYGWIN +OS_CYGWIN_NT #endif #if defined(__INTERIX) OS_INTERIX #endif +#if defined(__gnu_hurd__) +/* Hurd is very similar to GNU/Linux, it should work out of the box */ +OS_LINUX +#endif + #if defined(__i386) || defined(_X86) ARCH_X86 #endif diff --git a/ctest/Makefile b/ctest/Makefile index 1e07bd154..b1295640f 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,7 +5,7 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system -CFLAGS += -DADD$(BU) -DCBLAS +override CFLAGS += -DADD$(BU) -DCBLAS LIB = $(TOPDIR)/$(LIBNAME) diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 222734d5e..7dfabfa81 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; - y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F n_from = 0; n_to = n; + //Use y as each thread's n* COMPSIZE elements in sb buffer + y = buffer; + buffer += ((COMPSIZE * n + 1023) & ~1023); + if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a += n_from * lda * COMPSIZE; } - if (range_n) y += *range_n * COMPSIZE; if (incx != 1) { COPY_K(n, x, incx, buffer, 1); @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x if (num_cpu) { queue[0].sa = NULL; - queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x #else ONE, ZERO, #endif - buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); } AXPYU_K(n, 0, 0, diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index f9007f831..3e11f9aba 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; -#if defined(LOONGSON3A) +#if 0 //defined(LOONGSON3A) queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; #else @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( } if (num_cpu) { -#if defined(LOONGSON3A) +#if 0 //defined(LOONGSON3A) queue[0].sa = sa; queue[0].sb = sa + GEMM_OFFSET_A1 * 5; #else diff --git a/driver/others/Makefile b/driver/others/Makefile index 75b552b65..c449ec6c6 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,12 +1,12 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif @@ -14,7 +14,7 @@ endif # COMMONOBJS += info.$(SUFFIX) -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) COMMONOBJS += dynamic.$(SUFFIX) else COMMONOBJS += parameter.$(SUFFIX) @@ -70,7 +70,7 @@ ifndef BLAS_SERVER BLAS_SERVER = blas_server.c endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_config.$(SUFFIX) : openblas_get_config.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) @@ -215,7 +218,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) -hpl : CFLAGS += -DHPL -hpl_p : CFLAGS += -DHPL +hpl : override CFLAGS += -DHPL +hpl_p : override CFLAGS += -DHPL include $(TOPDIR)/Makefile.tail diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 66067a05c..2afcb742e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR @@ -435,7 +436,7 @@ static int blas_thread_server(void *arg){ blas_memory_free(buffer); - pthread_exit(NULL); + //pthread_exit(NULL); return 0; } @@ -770,6 +771,19 @@ void goto_set_num_threads(int num_threads) { if (num_threads < 1) num_threads = blas_num_threads; +#ifndef NO_AFFINITY + if (num_threads == 1) { + if (blas_cpu_number == 1){ + //OpenBLAS is already single thread. + return; + }else{ + //From multi-threads to single thread + //Restore the original affinity mask + gotoblas_set_affinity(-1); + } + } +#endif + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > blas_num_threads) { @@ -800,6 +814,13 @@ void goto_set_num_threads(int num_threads) { UNLOCK_COMMAND(&server_lock); } +#ifndef NO_AFFINITY + if(blas_cpu_number == 1 && num_threads > 1){ + //Restore the thread 0 affinity. 
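+ //(Counterpart of the gotoblas_set_affinity(-1) call above: when scaling
+ // back up from one thread, re-pin the calling thread as thread 0 before
+ // raising blas_cpu_number.)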
+ gotoblas_set_affinity(0); + } +#endif + blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c45856fd9..c567ed688 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -49,8 +49,12 @@ int blas_server_avail = 0; +static void * blas_thread_buffer[MAX_CPU_NUMBER]; + void goto_set_num_threads(int num_threads) { + int i=0; + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; omp_set_num_threads(blas_cpu_number); - + + //adjust buffer for each thread + for(i=0; i sa; sb = queue -> sb; @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { - buffer = blas_memory_alloc(2); + pos = omp_get_thread_num(); + buffer = blas_thread_buffer[pos]; + + //fallback + if(buffer==NULL) { + buffer = blas_memory_alloc(2); + release_flag=1; + } if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); @@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } } @@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ } - if (buffer != NULL) blas_memory_free(buffer); + if (release_flag) blas_memory_free(buffer); } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 6708509e1..bd1069c5e 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,6 +63,8 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; + + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ @@ -179,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ do { action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); + } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); if (action == WAIT_OBJECT_0 + 1) break; @@ -251,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR @@ -263,7 +266,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else { legacy_exec(routine, queue -> mode, queue -> args, sb); } - } + }else{ + continue; //if queue == NULL + } #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); @@ -425,7 +430,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. 
*/ -int blas_thread_shutdown_(void){ +int BLASFUNC(blas_thread_shutdown)(void){ int i; @@ -437,7 +442,7 @@ int blas_thread_shutdown_(void){ SetEvent(pool.killed); - for(i = 0; i < blas_cpu_number - 1; i++){ + for(i = 0; i < blas_num_threads - 1; i++){ WaitForSingleObject(blas_threads[i], INFINITE); } @@ -448,3 +453,47 @@ int blas_thread_shutdown_(void){ return 0; } + +void goto_set_num_threads(int num_threads) +{ + long i; + + if (num_threads < 1) num_threads = blas_cpu_number; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + //increased_threads = 1; + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + blas_server_avail = 1; + } + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; +} + +void openblas_set_num_threads(int num) +{ + goto_set_num_threads(num); +} diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 0364d0374..893dd0738 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -60,6 +60,16 @@ extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_BARCELONA; +extern gotoblas_t gotoblas_BOBCAT; +#ifndef NO_AVX +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; +#else +//Use NEHALEM kernels for sandy bridge +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA +#endif + #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -68,6 +78,32 @@ extern gotoblas_t gotoblas_BARCELONA; #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +#ifndef NO_AVX +static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} +#endif + +int support_avx(){ +#ifndef NO_AVX + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +#else + return 0; +#endif +} + static int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -122,15 +158,39 @@ static gotoblas_t *get_coretype(void){ if (model == 12) return &gotoblas_ATOM; return NULL; - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; - //Intel Xeon Processor 5600 (Westmere-EP) - if (model == 12) return &gotoblas_NEHALEM; - return NULL; + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + fprintf(stderr, 
"OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 3: + //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; @@ -144,7 +204,17 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else { + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } else { return &gotoblas_BARCELONA; } } @@ -178,6 +248,9 @@ static char *corename[] = { "Opteron(SSE3)", "Barcelona", "Nano", + "Sandybridge", + "Bobcat", + "Bulldozer", }; char *gotoblas_corename(void) { @@ -197,7 +270,10 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_NANO) return corename[15]; - + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; + return corename[0]; } @@ -211,12 +287,21 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } #endif if (gotoblas && gotoblas -> init) { gotoblas -> init(); } else { - fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); + fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); exit(1); } diff --git a/driver/others/init.c b/driver/others/init.c index 4adba661f..f6924d5f4 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MAX_NODES 16 #define MAX_CPUS 256 +#define NCPUBITS (8*sizeof(unsigned long)) +#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) +#define CPUELT(cpu) ((cpu) / NCPUBITS) +#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) + #define SH_MAGIC 0x510510 @@ -103,10 +108,10 @@ typedef struct { int num_nodes; int num_procs; int final_num_procs; - unsigned long avail; - + unsigned long avail [MAX_BITMASK_LEN]; + int avail_count; unsigned long cpu_info [MAX_CPUS]; - unsigned long node_info [MAX_NODES]; + unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; int cpu_use[MAX_CPUS]; } shm_t; @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; static int shmid, pshmid; static void *paddr; -static unsigned long lprocmask, lnodemask; +static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; +static int lprocmask_count = 0; static int numprocs = 1; static int numnodes = 1; @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. ***/ -static inline unsigned long get_cpumap(int node) { +static inline void get_cpumap(int node, unsigned long * node_info) { int infile; - unsigned long affinity; + unsigned long affinity[32]; char name[160]; char cpumap[160]; - char *p, *dummy; + char *dummy; int i=0; + int count=0; + int k=0; sprintf(name, CPUMAP_NAME, node); infile = open(name, O_RDONLY); + for(i=0; i<32; i++){ + affinity[i] = 0; + } - affinity = 0; - if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); - p = cpumap; - while (*p != '\n' && i<160){ - if(*p != ',') { - name[i++]=*p; + + for(i=0; i<160; i++){ + if(cpumap[i] == '\n') + break; + if(cpumap[i] != ','){ + name[k++]=cpumap[i]; + + //Enough data for Hex + if(k >= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } } - p++; + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i<count; i++){ + node_info[i] = affinity[count-i-1]; + } + close(infile); } - return affinity; + return; } -static inline unsigned long get_share(int cpu, int level) { +static inline void get_share(int cpu, int level, unsigned long * share) { int infile; - unsigned long affinity; + unsigned long affinity[32]; char cpumap[160]; char name[160]; - char *p, *dummy; + char *dummy; int i=0; + int count=0; + int k=0; sprintf(name, SHARE_NAME, cpu, level); infile = open(name, O_RDONLY); + for(i=0; i<32; i++){ + affinity[i] = 0; + } if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); + for(i=0; i<160; i++){ + if(cpumap[i] == '\n') + break; + if(cpumap[i] != ','){ + name[k++]=cpumap[i]; + + //Enough data for Hex + if(k >= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + } + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i<count; i++){ + share[i] = affinity[count-i-1]; + } + close(infile); } - return affinity; + return; } @@ static int numa_check(void) { DIR *dp; struct dirent *dir; int node; + int j; common -> num_nodes = 0; @@ -258,7 +309,9 @@ static int numa_check(void) { return 0; } - for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + for (node = 0; node < MAX_NODES; node ++) { + for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; + } while ((dir = readdir(dp)) != NULL) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { @@ -266,12 +319,12 @@ node = atoi(&dir -> d_name[4]); if (node > MAX_NODES) { - fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. 
Terminated.\n"); exit(1); } common -> num_nodes ++; - common -> node_info[node] = get_cpumap(node); + get_cpumap(node, common->node_info[node]); } } @@ -284,7 +337,7 @@ static int numa_check(void) { fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); for (node = 0; node < common -> num_nodes; node ++) - fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); #endif return common -> num_nodes; @@ -296,11 +349,13 @@ static void numa_mapping(void) { int i, j, h; unsigned long work, bit; int count = 0; + int bitmask_idx = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { - if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + bitmask_idx = CPUELT(cpu); + if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; @@ -357,58 +412,92 @@ static void numa_mapping(void) { static void disable_hyperthread(void) { - unsigned long share; + unsigned long share[MAX_BITMASK_LEN]; int cpu; + int bitmask_idx = 0; + int i=0, count=0; + bitmask_idx = CPUELT(common -> num_procs); - if(common->num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); - exit(1); - }else if(common->num_procs == 64){ - common -> avail = 0xFFFFFFFFFFFFFFFFUL; - }else - common -> avail = (1UL << common -> num_procs) - 1; + for(i=0; i< bitmask_idx; i++){ + common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> num_procs) != 1){ + common -> avail[count++] = CPUMASK(common -> num_procs) - 1; + } + common -> avail_count = count; + + /* if(common->num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ + /* exit(1); */ + /* }else if(common->num_procs == 64){ */ + /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* common -> avail = (1UL << common -> num_procs) - 1; */ #ifdef DEBUG - fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); + fprintf(stderr, "\nAvail CPUs : "); + for(i=0; i<count; i++) fprintf(stderr, "%04lx ", common -> avail[i]); + fprintf(stderr, ".\n"); #endif for (cpu = 0; cpu < common -> num_procs; cpu ++) { - - share = (get_share(cpu, 1) & common -> avail); - - if (popcount(share) > 1) { + + get_share(cpu, 1, share); + + //If the shared CPUs fall in different elements of the share & avail arrays, this may be a bug.
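The CPUELT/CPUMASK macros above replace the single unsigned long CPU mask, and with it the old 64-core ceiling: CPU n now lives in word CPUELT(n) at bit CPUMASK(n) of an array of MAX_BITMASK_LEN words. A self-contained sketch of the indexing, with MAX_CPUS mirroring the value in init.c:

    #include <stdio.h>

    #define MAX_CPUS        256
    #define NCPUBITS        (8 * sizeof(unsigned long))
    #define MAX_BITMASK_LEN (MAX_CPUS / NCPUBITS)
    #define CPUELT(cpu)  ((cpu) / NCPUBITS)
    #define CPUMASK(cpu) ((unsigned long)1UL << ((cpu) % NCPUBITS))

    int main(void) {
        unsigned long avail[MAX_BITMASK_LEN] = {0};
        int cpu = 70;                        /* would not fit in one 64-bit word */

        avail[CPUELT(cpu)] |= CPUMASK(cpu);  /* mark cpu 70 as available */

        printf("cpu %d -> word %zu, bit %zu, set=%d\n", cpu,
               (size_t)CPUELT(cpu), (size_t)(cpu % NCPUBITS),
               (avail[CPUELT(cpu)] & CPUMASK(cpu)) != 0);
        return 0;
    }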
+ for (i = 0; i < count ; i++){ + + share[i] &= common->avail[i]; + + if (popcount(share[i]) > 1) { #ifdef DEBUG - fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", - cpu, share & ~(1UL << cpu)); + fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", + cpu, share[i] & ~(CPUMASK(cpu))); #endif - common -> avail &= ~((share & ~(1UL << cpu))); + common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); + } } } } static void disable_affinity(void) { - + int i=0; + int bitmask_idx=0; + int count=0; #ifdef DEBUG - fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); #endif - if(common->final_num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); - exit(1); - }else if(common->final_num_procs == 64){ - lprocmask = 0xFFFFFFFFFFFFFFFFUL; - }else - lprocmask = (1UL << common -> final_num_procs) - 1; + /* if(common->final_num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ + /* exit(1); */ + /* }else if(common->final_num_procs == 64){ */ + /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* lprocmask = (1UL << common -> final_num_procs) - 1; */ + + bitmask_idx = CPUELT(common -> final_num_procs); + + for(i=0; i< bitmask_idx; i++){ + lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> final_num_procs) != 1){ + lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; + } + lprocmask_count = count; #ifndef USE_OPENMP - lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; + for(i=0; i< count; i++){ + lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; + } #endif #ifdef DEBUG - fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); #endif } @@ -498,7 +587,7 @@ static void create_pshmem(void) { static void local_cpu_map(void) { int cpu, id, mapping; - + int bitmask_idx = 0; cpu = 0; mapping = 0; @@ -508,8 +597,9 @@ static void local_cpu_map(void) { if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } - - if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + bitmask_idx = CPUELT(cpu); + if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { common -> cpu_use[cpu] = pshmid; cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); @@ -595,6 +685,7 @@ void gotoblas_affinity_init(void) { #ifndef USE_OPENMP cpu_set_t cpu_mask; #endif + int i; if (initialized) return; @@ -646,6 +737,11 @@ void gotoblas_affinity_init(void) { common -> num_procs = get_nprocs(); + if(common -> num_procs > MAX_CPUS) { + fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d).
Terminated.\n", common->num_procs, MAX_CPUS); + exit(1); + } + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; numa_check(); @@ -654,7 +750,8 @@ if (common -> num_nodes > 1) numa_mapping(); - common -> final_num_procs = popcount(common -> avail); + common -> final_num_procs = 0; + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; @@ -664,7 +761,8 @@ disable_affinity(); - num_avail = popcount(lprocmask); + num_avail = 0; + for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; diff --git a/driver/others/memory.c b/driver/others/memory.c index 3f1a5f60a..2070adf5d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) #include <sys/sysctl.h> #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) int get_num_procs(void) { @@ -206,7 +206,27 @@ int get_num_procs(void) { #endif +#if defined(OS_DARWIN) +int get_num_procs(void) { + static int nums = 0; + size_t len; + if (nums == 0){ + len = sizeof(int); + sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); + } + return nums; +} +#endif + +/* +OpenBLAS uses the number of CPU cores for multithreading. +It can be set by openblas_set_num_threads(int num_threads); +*/ int blas_cpu_number = 0; +/* +The number of threads in the thread pool. +This value is greater than or equal to blas_cpu_number, which means some threads may be sleeping.
+*/ int blas_num_threads = 0; int goto_get_num_procs (void) { @@ -215,7 +235,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +243,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) max_num = get_num_procs(); #endif @@ -250,7 +270,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1128,7 +1148,7 @@ static BLASULONG init_lock = 0UL; static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, void *sa, void *sb, BLASLONG pos) { -#ifndef ARCH_POWER +#if !defined(ARCH_POWER) && !defined(ARCH_SPARC) long size; BLASULONG buffer; @@ -1289,6 +1309,7 @@ void DESTRUCTOR gotoblas_quit(void) { moncontrol (1); #endif + blas_shutdown(); } #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c new file mode 100644 index 000000000..581ab1a43 --- /dev/null +++ b/driver/others/openblas_get_config.c @@ -0,0 +1,59 @@ +/***************************************************************************** +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include "common.h" + +static char* openblas_config_str="" +#ifdef USE64BITINT + "USE64BITINT " +#endif +#ifdef NO_CBLAS + "NO_CBLAS " +#endif +#ifdef NO_LAPACK + "NO_LAPACK " +#endif +#ifdef NO_LAPACKE + "NO_LAPACKE " +#endif +#ifdef DYNAMIC_ARCH + "DYNAMIC_ARCH " +#endif +#ifdef NO_AFFINITY + "NO_AFFINITY " +#endif + ; + +char* CNAME() { + return openblas_config_str; +} + diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 7ca3b7114..5e24cfcc7 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,13 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef SMP_SERVER -#ifdef OS_LINUX extern void openblas_set_num_threads(int num_threads) ; -void NAME(int* num_threads){ +void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } -#endif +#else +//Single thread + +void openblas_set_num_threads(int num_threads) { +} + +void openblas_set_num_threads_(int* num_threads){ + +} #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 21f56e889..58e5fb11d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,9 +163,9 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -384,6 +384,17 @@ void blas_set_parameter(void){ #endif #endif +#if defined(SANDYBRIDGE) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; @@ -435,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) size >>= 8; sgemm_p = 232 * size; diff --git a/exports/Makefile b/exports/Makefile index 873e8b270..15041be86 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -10,10 +10,23 @@ ifndef NO_CBLAS NO_CBLAS = 0 endif +ifndef NO_LAPACK +NO_LAPACK = 0 +endif + +ifndef NO_LAPACKE +NO_LAPACKE = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) EXTRALIB += -lgfortran endif +ifeq ($(USE_OPENMP), 1) +ifeq ($(C_COMPILER), GCC) +EXTRALIB += -lgomp +endif +endif endif ifeq ($(OSNAME), CYGWIN_NT) @@ -58,15 +71,20 @@ dll : ../$(LIBDLLNAME) dll2 : libgoto2_shared.dll +# On Windows, we only generate a DLL without a version suffix. This is because +# applications which link against the dynamic library reference a fixed DLL name +# in their import table. By instead using a stable name it is possible to +# upgrade between library versions, without needing to re-link an application. 
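The new openblas_get_config() entry point above (CNAME expands to openblas_get_config at build time) returns a static string naming the options the library was compiled with. A usage sketch; the hand-written prototype is an assumption, since the exported header may differ between versions:

    #include <stdio.h>

    extern char *openblas_get_config(void);

    int main(void) {
        /* prints e.g. "DYNAMIC_ARCH NO_AFFINITY" depending on the build */
        printf("OpenBLAS build options: %s\n", openblas_get_config());
        return 0;
    }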
+# For more details see: https://github.com/xianyi/OpenBLAS/issues/127. ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:i386 /def:libopenblas.def else $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:X64 /def:libopenblas.def endif @@ -76,13 +94,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) libopenblas.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) libgoto2_shared.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) @@ -106,14 +124,15 @@ so : ../$(LIBSONAME) endif -ifeq ($(OSNAME), FreeBSD) +#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest @@ -163,23 +182,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) linux.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) symbol.S : gensymbol - perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S + perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S test : linktest.c $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* diff --git a/exports/gensymbol b/exports/gensymbol index 6b2a00672..99609b356 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -72,7 +72,17 @@ zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, ); +@misc_no_underscore_objs = ( + openblas_set_num_threads, goto_set_num_threads, + openblas_get_config, + ); + +@misc_underscore_objs = ( + openblas_set_num_threads, + ); + @lapackobjs = ( + # These routines are provided by OpenBLAS. sgesv, dgesv, cgesv, zgesv, sgetf2, dgetf2, cgetf2, zgetf2, sgetrf, dgetrf, cgetrf, zgetrf, @@ -88,17 +98,72 @@ ); @lapackobjs2 = ( - sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + # These routines are provided by LAPACK (reference implementation). + # + # This list is prepared by copying all routines listed in + # `lapack-3.4.1/SRC/Makefile` and replacing the '.o' suffix with a comma. + # Thereafter the following routines should be removed: + # - those provided by OpenBLAS (see @lapackobjs) + # - extra precision routines (see @lapack_extendedprecision_objs) + # Each of these have been marked individually with "already provided" or "excluded". + + # ALLAUX -- Auxiliary routines called from all precisions + # already provided by @blasobjs: xerbla, lsame + ilaenv, ieeeck, lsamen, xerbla_array, iparmq, + ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, + ilaver, slamch, slamc3, + + # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. + # excluded: second_$(TIMER) + sbdsdc, + sbdsqr, sdisna, slabad, slacpy, sladiv, slae2, slaebz, + slaed0, slaed1, slaed2, slaed3, slaed4, slaed5, slaed6, + slaed7, slaed8, slaed9, slaeda, slaev2, slagtf, + slagts, slamrg, slanst, + slapy2, slapy3, slarnv, + slarra, slarrb, slarrc, slarrd, slarre, slarrf, slarrj, + slarrk, slarrr, slaneg, + slartg, slaruv, slas2, slascl, + slasd0, slasd1, slasd2, slasd3, slasd4, slasd5, slasd6, + slasd7, slasd8, slasda, slasdq, slasdt, + slaset, slasq1, slasq2, slasq3, slasq4, slasq5, slasq6, + slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, + ssteqr, ssterf, slaisnan, sisnan, + slartgp, slartgs, + + # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
+ # excluded: dsecnd_$(TIMER) + dbdsdc, + dbdsqr, ddisna, dlabad, dlacpy, dladiv, dlae2, dlaebz, + dlaed0, dlaed1, dlaed2, dlaed3, dlaed4, dlaed5, dlaed6, + dlaed7, dlaed8, dlaed9, dlaeda, dlaev2, dlagtf, + dlagts, dlamrg, dlanst, + dlapy2, dlapy3, dlarnv, + dlarra, dlarrb, dlarrc, dlarrd, dlarre, dlarrf, dlarrj, + dlarrk, dlarrr, dlaneg, + dlartg, dlaruv, dlas2, dlascl, + dlasd0, dlasd1, dlasd2, dlasd3, dlasd4, dlasd5, dlasd6, + dlasd7, dlasd8, dlasda, dlasdq, dlasdt, + dlaset, dlasq1, dlasq2, dlasq3, dlasq4, dlasq5, dlasq6, + dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, + dsteqr, dsterf, dlaisnan, disnan, + dlartgp, dlartgs, + dlamch, dlamc3, + + # SLASRC -- Single precision real LAPACK routines + # already provided by @lapackobjs: + # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqrf, sgerfs, sgerq2, sgerqf, - sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, - sgetri, + sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, + sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, + sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, - sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, @@ -108,8 +173,8 @@ slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, - slaqtr, slar1v, slar2v, - slarf, slarfb, slarfg, slarft, slarfx, slargv, + slaqtr, slar1v, slar2v, ilaslr, ilaslc, + slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, slarrv, slartv, slarz, slarzb, slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, @@ -119,41 +184,65 @@ sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, - spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, - sposvx, spotrs, sppcon, sppequ, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spstrf, spstf2, + sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, - spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, - sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + sstevx, + ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, - ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytrs, stbcon, + ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, + ssyswapr, ssytrs, ssytrs2, ssyconv, + stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, strtrs, stzrqf, stzrzf, sstemr, - + slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp, + stfttr, stpttf, stpttr, strttf, strttp, + sgejsv, sgesvj, sgsvj0, sgsvj1, + sgeequb, ssyequb, spoequb, sgbequb, + sbbcsd, slapmr, sorbdb, sorcsd, + sgeqrt, 
sgeqrt2, sgeqrt3, sgemqrt, + stpqrt, stpqrt2, stpmqrt, stprfb, + + # DSLASRC -- Double-single mixed precision real routines called from + # single, single-extra and double precision real LAPACK + # routines (i.e. from SLASRC, SXLASRC, DLASRC). + # + # already provided by @lapackobjs: + # sgetrs, spotrf, sgetrf + spotrs, + + # CLASRC -- Single precision complex LAPACK routines + # already provided by @blasobjs: csymv + # already provided by @lapackobjs: + # cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqrf, cgerfs, cgerq2, cgerqf, - cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, - cgetri, + cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, + cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, + cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, cggsvd, cggsvp, - cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, - chetrf, chetri, chetrs, chgeqz, chpcon, chpev, chpevd, - chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chetrf, chetri, chetri2, chetri2x, cheswapr, + chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, @@ -166,20 +255,23 @@ clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, - claqsp, claqsy, clar1v, clar2v, clarf, clarfb, clarfg, clarft, + claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, + clarf, clarfb, clarfg, clarft, clarfgp, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, - clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, - cposv, cposvx, cpotrs, cppcon, - cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, + cposv, cposvx, cpstrf, cpstf2, + cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, - crot, cspcon, cspmv, cspr, csprfs, cspsv, + crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, - cstegr, cstein, csteqr, csycon, - csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, - csytrs, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + cstegr, cstein, csteqr, + csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x, + csyswapr, csytrs, csytrs2, csyconv, + ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, @@ -188,18 +280,36 @@ cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, cunmtr, cupgtr, cupmtr, icmax1, scsum1, cstemr, - - dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, + chfrk, ctfttp, 
clanhf, cpftrf, cpftri, cpftrs, ctfsm, ctftri, + ctfttr, ctpttf, ctpttr, ctrttf, ctrttp, + cgeequb, cgbequb, csyequb, cpoequb, cheequb, + cbbcsd, clapmr, cunbdb, cuncsd, + cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, + ctpqrt, ctpqrt2, ctpmqrt, ctprfb, + + # ZCLASRC -- Double-single mixed precision complex routines called from + # single, single-extra and double precision complex LAPACK + # routines (i.e. from CLASRC, CXLASRC, ZLASRC). + # + # already provided by @lapackobjs: + # cgetrs, cpotrf, cgetrf + cpotrs, + + # DLASRC -- Double precision real LAPACK routines + # already provided by @lapackobjs: + # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, + # dtrti2, dtrtri + dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqrf, dgerfs, dgerq2, dgerqf, - dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, - dgetri, + dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, + dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, + dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, - dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, @@ -209,9 +319,9 @@ dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, - dlaqtr, dlar1v, dlar2v, - dlarf, dlarfb, dlarfg, dlarft, dlarfx, dlargv, - dlarrv, dlartv, + dlaqtr, dlar1v, dlar2v, iladlr, iladlc, + dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, + dlargv, dlarrv, dlartv, dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, @@ -220,43 +330,59 @@ dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, - dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, - dposvx, dpotrs, dppcon, dppequ, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dposvx, dpotrs, dpstrf, dpstf2, + dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, - dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, - dstevx, dsycon, dsyev, dsyevd, dsyevr, + dstevx, + dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, - dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dtbcon, - dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytri2, dsytri2x, + dsyswapr, dsytrs, dsytrs2, dsyconv, + dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, dtrtrs, dtzrqf, dtzrzf, dstemr, - dsgesv, dlag2s, slag2d, - + dsgesv, dsposv, dlag2s, slag2d, dlat2s, + dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp, + dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, + dgejsv, dgesvj, dgsvj0, dgsvj1, + dgeequb, dsyequb, dpoequb, dgbequb, + dbbcsd, dlapmr, 
dorbdb, dorcsd, + dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, + dtpqrt, dtpqrt2, dtpmqrt, dtprfb, + + # ZLASRC -- Double precision complex LAPACK routines + # already provided by @blasobjs: zsymv + # already provided by @lapackobjs: + # zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, zpotri, + # ztrti2, ztrtri zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, - zgeqpf, zgeqr2, zgeqrf, zgerfs, zgerq2, zgerqf, + zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, zgetri, zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, zggsvd, zggsvp, - zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, - zhetrf, zhetri, zhetrs, zhgeqz, zhpcon, zhpev, zhpevd, - zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, + zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, @@ -270,22 +396,25 @@ zlantp, zlantr, zlapll, zlapmt, zlaqgb, zlaqge, zlaqhb, zlaqhe, zlaqhp, zlaqp2, zlaqps, zlaqsb, zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, - zlaqsp, zlaqsy, zlar1v, zlar2v, zlarcm, zlarf, zlarfb, - zlarfg, zlarft, + zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, + zlarcm, zlarf, zlarfb, + zlarfg, zlarft, zlarfgp, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, - zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, - zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, - zposv, zposvx, zpotrs, zppcon, - zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, + zposv, zposvx, zpotrs, zpstrf, zpstf2, + zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, - zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, - zstegr, zstein, zsteqr, zsycon, - zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, - zsytrs, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + zstegr, zstein, zsteqr, + zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x, + zsyswapr, zsytrs, zsytrs2, zsyconv, + ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, @@ -295,27 +424,2281 @@ zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, zunmtr, zupgtr, zupmtr, izmax1, dzsum1, zstemr, - zcgesv, zlag2c, clag2z, + zcgesv, zcposv, zlag2c, clag2z, zlat2c, + zhfrk, ztfttp, zlanhf, zpftrf, zpftri, zpftrs, ztfsm, ztftri, + ztfttr, ztpttf, ztpttr, ztrttf, ztrttp, + zgeequb, zgbequb, zsyequb, zpoequb, zheequb, + zbbcsd, zlapmr, zunbdb, zuncsd, + zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, + ztpqrt, ztpqrt2, ztpmqrt, 
ztprfb, + ); + +@lapack_extendedprecision_objs = ( + zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, + dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, + ); + +@lapackeobjs = ( + # LAPACK C interface routines. + # + # This list is prepared in a similar manner to @lapackobjs2, however the + # functions all begin with an uppercase prefix (with the exception of the + # make_complex_* routines). + # + # The functions corresponding to @(MATGEN_OBJ) and @(SRCX_OBJ) are not + # exported since the respective LAPACK routines are not built by default. + + # @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_cgb_nancheck, + LAPACKE_cgb_trans, + LAPACKE_cge_nancheck, + LAPACKE_cge_trans, + LAPACKE_cgg_nancheck, + LAPACKE_cgg_trans, + LAPACKE_cgt_nancheck, + LAPACKE_chb_nancheck, + LAPACKE_chb_trans, + LAPACKE_che_nancheck, + LAPACKE_che_trans, + LAPACKE_chp_nancheck, + LAPACKE_chp_trans, + LAPACKE_chs_nancheck, + LAPACKE_chs_trans, + LAPACKE_c_nancheck, + LAPACKE_cpb_nancheck, + LAPACKE_cpb_trans, + LAPACKE_cpf_nancheck, + LAPACKE_cpf_trans, + LAPACKE_cpo_nancheck, + LAPACKE_cpo_trans, + LAPACKE_cpp_nancheck, + LAPACKE_cpp_trans, + LAPACKE_cpt_nancheck, + LAPACKE_csp_nancheck, + LAPACKE_csp_trans, + LAPACKE_cst_nancheck, + LAPACKE_csy_nancheck, + LAPACKE_csy_trans, + LAPACKE_ctb_nancheck, + LAPACKE_ctb_trans, + LAPACKE_ctf_nancheck, + LAPACKE_ctf_trans, + LAPACKE_ctp_nancheck, + LAPACKE_ctp_trans, + LAPACKE_ctr_nancheck, + LAPACKE_ctr_trans, + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_lsame, + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_xerbla, + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + 
LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + lapack_make_complex_float, + lapack_make_complex_double, + + # @(SRC_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + LAPACKE_cbbcsd, + LAPACKE_cbbcsd_work, + LAPACKE_cbdsqr, + LAPACKE_cbdsqr_work, + LAPACKE_cgbbrd, + LAPACKE_cgbbrd_work, + LAPACKE_cgbcon, + LAPACKE_cgbcon_work, + LAPACKE_cgbequ, + LAPACKE_cgbequ_work, + LAPACKE_cgbequb, + LAPACKE_cgbequb_work, + LAPACKE_cgbrfs, + LAPACKE_cgbrfs_work, + LAPACKE_cgbsv, + LAPACKE_cgbsv_work, + LAPACKE_cgbsvx, + LAPACKE_cgbsvx_work, + LAPACKE_cgbtrf, + LAPACKE_cgbtrf_work, + LAPACKE_cgbtrs, + LAPACKE_cgbtrs_work, + LAPACKE_cgebak, + LAPACKE_cgebak_work, + LAPACKE_cgebal, + LAPACKE_cgebal_work, + LAPACKE_cgebrd, + LAPACKE_cgebrd_work, + LAPACKE_cgecon, + LAPACKE_cgecon_work, + LAPACKE_cgeequ, + LAPACKE_cgeequ_work, + LAPACKE_cgeequb, + LAPACKE_cgeequb_work, + LAPACKE_cgees, + LAPACKE_cgees_work, + LAPACKE_cgeesx, + LAPACKE_cgeesx_work, + LAPACKE_cgeev, + LAPACKE_cgeev_work, + LAPACKE_cgeevx, + LAPACKE_cgeevx_work, + LAPACKE_cgehrd, + LAPACKE_cgehrd_work, + LAPACKE_cgelq2, + LAPACKE_cgelq2_work, + LAPACKE_cgelqf, + LAPACKE_cgelqf_work, + LAPACKE_cgels, + LAPACKE_cgels_work, + LAPACKE_cgelsd, + LAPACKE_cgelsd_work, + LAPACKE_cgelss, + LAPACKE_cgelss_work, + LAPACKE_cgelsy, + LAPACKE_cgelsy_work, + LAPACKE_cgemqrt, + LAPACKE_cgemqrt_work, + LAPACKE_cgeqlf, + LAPACKE_cgeqlf_work, + LAPACKE_cgeqp3, + LAPACKE_cgeqp3_work, + LAPACKE_cgeqpf, + LAPACKE_cgeqpf_work, + LAPACKE_cgeqr2, + LAPACKE_cgeqr2_work, + LAPACKE_cgeqrf, + LAPACKE_cgeqrf_work, + LAPACKE_cgeqrfp, + LAPACKE_cgeqrfp_work, + LAPACKE_cgeqrt, + LAPACKE_cgeqrt2, + LAPACKE_cgeqrt2_work, + LAPACKE_cgeqrt3, + LAPACKE_cgeqrt3_work, + LAPACKE_cgeqrt_work, + LAPACKE_cgerfs, + LAPACKE_cgerfs_work, + LAPACKE_cgerqf, + LAPACKE_cgerqf_work, + LAPACKE_cgesdd, + LAPACKE_cgesdd_work, + LAPACKE_cgesv, + LAPACKE_cgesv_work, + LAPACKE_cgesvd, + LAPACKE_cgesvd_work, + LAPACKE_cgesvx, + LAPACKE_cgesvx_work, + LAPACKE_cgetf2, + LAPACKE_cgetf2_work, + LAPACKE_cgetrf, + LAPACKE_cgetrf_work, + LAPACKE_cgetri, + LAPACKE_cgetri_work, + LAPACKE_cgetrs, + LAPACKE_cgetrs_work, + LAPACKE_cggbak, + LAPACKE_cggbak_work, + LAPACKE_cggbal, + LAPACKE_cggbal_work, + LAPACKE_cgges, + LAPACKE_cgges_work, + LAPACKE_cggesx, + LAPACKE_cggesx_work, + LAPACKE_cggev, + LAPACKE_cggev_work, + LAPACKE_cggevx, + LAPACKE_cggevx_work, + LAPACKE_cggglm, + LAPACKE_cggglm_work, + LAPACKE_cgghrd, + LAPACKE_cgghrd_work, + LAPACKE_cgglse, + LAPACKE_cgglse_work, + LAPACKE_cggqrf, + LAPACKE_cggqrf_work, + LAPACKE_cggrqf, + LAPACKE_cggrqf_work, + LAPACKE_cggsvd, + LAPACKE_cggsvd_work, + LAPACKE_cggsvp, + LAPACKE_cggsvp_work, + LAPACKE_cgtcon, + LAPACKE_cgtcon_work, + LAPACKE_cgtrfs, + LAPACKE_cgtrfs_work, + LAPACKE_cgtsv, + LAPACKE_cgtsv_work, + LAPACKE_cgtsvx, + LAPACKE_cgtsvx_work, + LAPACKE_cgttrf, + LAPACKE_cgttrf_work, + LAPACKE_cgttrs, + LAPACKE_cgttrs_work, + LAPACKE_chbev, + LAPACKE_chbev_work, + LAPACKE_chbevd, + LAPACKE_chbevd_work, + LAPACKE_chbevx, + LAPACKE_chbevx_work, + LAPACKE_chbgst, + LAPACKE_chbgst_work, + LAPACKE_chbgv, + LAPACKE_chbgv_work, + LAPACKE_chbgvd, + LAPACKE_chbgvd_work, + LAPACKE_chbgvx, + LAPACKE_chbgvx_work, + LAPACKE_chbtrd, + 
LAPACKE_chbtrd_work, + LAPACKE_checon, + LAPACKE_checon_work, + LAPACKE_cheequb, + LAPACKE_cheequb_work, + LAPACKE_cheev, + LAPACKE_cheev_work, + LAPACKE_cheevd, + LAPACKE_cheevd_work, + LAPACKE_cheevr, + LAPACKE_cheevr_work, + LAPACKE_cheevx, + LAPACKE_cheevx_work, + LAPACKE_chegst, + LAPACKE_chegst_work, + LAPACKE_chegv, + LAPACKE_chegv_work, + LAPACKE_chegvd, + LAPACKE_chegvd_work, + LAPACKE_chegvx, + LAPACKE_chegvx_work, + LAPACKE_cherfs, + LAPACKE_cherfs_work, + LAPACKE_chesv, + LAPACKE_chesv_work, + LAPACKE_chesvx, + LAPACKE_chesvx_work, + LAPACKE_cheswapr, + LAPACKE_cheswapr_work, + LAPACKE_chetrd, + LAPACKE_chetrd_work, + LAPACKE_chetrf, + LAPACKE_chetrf_work, + LAPACKE_chetri, + LAPACKE_chetri2, + LAPACKE_chetri2_work, + LAPACKE_chetri2x, + LAPACKE_chetri2x_work, + LAPACKE_chetri_work, + LAPACKE_chetrs, + LAPACKE_chetrs2, + LAPACKE_chetrs2_work, + LAPACKE_chetrs_work, + LAPACKE_chfrk, + LAPACKE_chfrk_work, + LAPACKE_chgeqz, + LAPACKE_chgeqz_work, + LAPACKE_chpcon, + LAPACKE_chpcon_work, + LAPACKE_chpev, + LAPACKE_chpev_work, + LAPACKE_chpevd, + LAPACKE_chpevd_work, + LAPACKE_chpevx, + LAPACKE_chpevx_work, + LAPACKE_chpgst, + LAPACKE_chpgst_work, + LAPACKE_chpgv, + LAPACKE_chpgv_work, + LAPACKE_chpgvd, + LAPACKE_chpgvd_work, + LAPACKE_chpgvx, + LAPACKE_chpgvx_work, + LAPACKE_chprfs, + LAPACKE_chprfs_work, + LAPACKE_chpsv, + LAPACKE_chpsv_work, + LAPACKE_chpsvx, + LAPACKE_chpsvx_work, + LAPACKE_chptrd, + LAPACKE_chptrd_work, + LAPACKE_chptrf, + LAPACKE_chptrf_work, + LAPACKE_chptri, + LAPACKE_chptri_work, + LAPACKE_chptrs, + LAPACKE_chptrs_work, + LAPACKE_chsein, + LAPACKE_chsein_work, + LAPACKE_chseqr, + LAPACKE_chseqr_work, + LAPACKE_clacgv, + LAPACKE_clacgv_work, + LAPACKE_clacpy, + LAPACKE_clacpy_work, + LAPACKE_clag2z, + LAPACKE_clag2z_work, + LAPACKE_clange, + LAPACKE_clange_work, + LAPACKE_clanhe, + LAPACKE_clanhe_work, + LAPACKE_clansy, + LAPACKE_clansy_work, + LAPACKE_clantr, + LAPACKE_clantr_work, + LAPACKE_clapmr, + LAPACKE_clapmr_work, + LAPACKE_clarfb, + LAPACKE_clarfb_work, + LAPACKE_clarfg, + LAPACKE_clarfg_work, + LAPACKE_clarft, + LAPACKE_clarft_work, + LAPACKE_clarfx, + LAPACKE_clarfx_work, + LAPACKE_clarnv, + LAPACKE_clarnv_work, + LAPACKE_claset, + LAPACKE_claset_work, + LAPACKE_claswp, + LAPACKE_claswp_work, + LAPACKE_clauum, + LAPACKE_clauum_work, + LAPACKE_cpbcon, + LAPACKE_cpbcon_work, + LAPACKE_cpbequ, + LAPACKE_cpbequ_work, + LAPACKE_cpbrfs, + LAPACKE_cpbrfs_work, + LAPACKE_cpbstf, + LAPACKE_cpbstf_work, + LAPACKE_cpbsv, + LAPACKE_cpbsv_work, + LAPACKE_cpbsvx, + LAPACKE_cpbsvx_work, + LAPACKE_cpbtrf, + LAPACKE_cpbtrf_work, + LAPACKE_cpbtrs, + LAPACKE_cpbtrs_work, + LAPACKE_cpftrf, + LAPACKE_cpftrf_work, + LAPACKE_cpftri, + LAPACKE_cpftri_work, + LAPACKE_cpftrs, + LAPACKE_cpftrs_work, + LAPACKE_cpocon, + LAPACKE_cpocon_work, + LAPACKE_cpoequ, + LAPACKE_cpoequ_work, + LAPACKE_cpoequb, + LAPACKE_cpoequb_work, + LAPACKE_cporfs, + LAPACKE_cporfs_work, + LAPACKE_cposv, + LAPACKE_cposv_work, + LAPACKE_cposvx, + LAPACKE_cposvx_work, + LAPACKE_cpotrf, + LAPACKE_cpotrf_work, + LAPACKE_cpotri, + LAPACKE_cpotri_work, + LAPACKE_cpotrs, + LAPACKE_cpotrs_work, + LAPACKE_cppcon, + LAPACKE_cppcon_work, + LAPACKE_cppequ, + LAPACKE_cppequ_work, + LAPACKE_cpprfs, + LAPACKE_cpprfs_work, + LAPACKE_cppsv, + LAPACKE_cppsv_work, + LAPACKE_cppsvx, + LAPACKE_cppsvx_work, + LAPACKE_cpptrf, + LAPACKE_cpptrf_work, + LAPACKE_cpptri, + LAPACKE_cpptri_work, + LAPACKE_cpptrs, + LAPACKE_cpptrs_work, + LAPACKE_cpstrf, + LAPACKE_cpstrf_work, + LAPACKE_cptcon, + LAPACKE_cptcon_work, + 
LAPACKE_cpteqr, + LAPACKE_cpteqr_work, + LAPACKE_cptrfs, + LAPACKE_cptrfs_work, + LAPACKE_cptsv, + LAPACKE_cptsv_work, + LAPACKE_cptsvx, + LAPACKE_cptsvx_work, + LAPACKE_cpttrf, + LAPACKE_cpttrf_work, + LAPACKE_cpttrs, + LAPACKE_cpttrs_work, + LAPACKE_cspcon, + LAPACKE_cspcon_work, + LAPACKE_csprfs, + LAPACKE_csprfs_work, + LAPACKE_cspsv, + LAPACKE_cspsv_work, + LAPACKE_cspsvx, + LAPACKE_cspsvx_work, + LAPACKE_csptrf, + LAPACKE_csptrf_work, + LAPACKE_csptri, + LAPACKE_csptri_work, + LAPACKE_csptrs, + LAPACKE_csptrs_work, + LAPACKE_cstedc, + LAPACKE_cstedc_work, + LAPACKE_cstegr, + LAPACKE_cstegr_work, + LAPACKE_cstein, + LAPACKE_cstein_work, + LAPACKE_cstemr, + LAPACKE_cstemr_work, + LAPACKE_csteqr, + LAPACKE_csteqr_work, + LAPACKE_csycon, + LAPACKE_csycon_work, + LAPACKE_csyconv, + LAPACKE_csyconv_work, + LAPACKE_csyequb, + LAPACKE_csyequb_work, + LAPACKE_csyrfs, + LAPACKE_csyrfs_work, + LAPACKE_csysv, + LAPACKE_csysv_work, + LAPACKE_csysvx, + LAPACKE_csysvx_work, + LAPACKE_csyswapr, + LAPACKE_csyswapr_work, + LAPACKE_csytrf, + LAPACKE_csytrf_work, + LAPACKE_csytri, + LAPACKE_csytri2, + LAPACKE_csytri2_work, + LAPACKE_csytri2x, + LAPACKE_csytri2x_work, + LAPACKE_csytri_work, + LAPACKE_csytrs, + LAPACKE_csytrs2, + LAPACKE_csytrs2_work, + LAPACKE_csytrs_work, + LAPACKE_ctbcon, + LAPACKE_ctbcon_work, + LAPACKE_ctbrfs, + LAPACKE_ctbrfs_work, + LAPACKE_ctbtrs, + LAPACKE_ctbtrs_work, + LAPACKE_ctfsm, + LAPACKE_ctfsm_work, + LAPACKE_ctftri, + LAPACKE_ctftri_work, + LAPACKE_ctfttp, + LAPACKE_ctfttp_work, + LAPACKE_ctfttr, + LAPACKE_ctfttr_work, + LAPACKE_ctgevc, + LAPACKE_ctgevc_work, + LAPACKE_ctgexc, + LAPACKE_ctgexc_work, + LAPACKE_ctgsen, + LAPACKE_ctgsen_work, + LAPACKE_ctgsja, + LAPACKE_ctgsja_work, + LAPACKE_ctgsna, + LAPACKE_ctgsna_work, + LAPACKE_ctgsyl, + LAPACKE_ctgsyl_work, + LAPACKE_ctpcon, + LAPACKE_ctpcon_work, + LAPACKE_ctpmqrt, + LAPACKE_ctpmqrt_work, + LAPACKE_ctpqrt, + LAPACKE_ctpqrt2, + LAPACKE_ctpqrt2_work, + LAPACKE_ctpqrt_work, + LAPACKE_ctprfb, + LAPACKE_ctprfb_work, + LAPACKE_ctprfs, + LAPACKE_ctprfs_work, + LAPACKE_ctptri, + LAPACKE_ctptri_work, + LAPACKE_ctptrs, + LAPACKE_ctptrs_work, + LAPACKE_ctpttf, + LAPACKE_ctpttf_work, + LAPACKE_ctpttr, + LAPACKE_ctpttr_work, + LAPACKE_ctrcon, + LAPACKE_ctrcon_work, + LAPACKE_ctrevc, + LAPACKE_ctrevc_work, + LAPACKE_ctrexc, + LAPACKE_ctrexc_work, + LAPACKE_ctrrfs, + LAPACKE_ctrrfs_work, + LAPACKE_ctrsen, + LAPACKE_ctrsen_work, + LAPACKE_ctrsna, + LAPACKE_ctrsna_work, + LAPACKE_ctrsyl, + LAPACKE_ctrsyl_work, + LAPACKE_ctrtri, + LAPACKE_ctrtri_work, + LAPACKE_ctrtrs, + LAPACKE_ctrtrs_work, + LAPACKE_ctrttf, + LAPACKE_ctrttf_work, + LAPACKE_ctrttp, + LAPACKE_ctrttp_work, + LAPACKE_ctzrzf, + LAPACKE_ctzrzf_work, + LAPACKE_cunbdb, + LAPACKE_cunbdb_work, + LAPACKE_cuncsd, + LAPACKE_cuncsd_work, + LAPACKE_cungbr, + LAPACKE_cungbr_work, + LAPACKE_cunghr, + LAPACKE_cunghr_work, + LAPACKE_cunglq, + LAPACKE_cunglq_work, + LAPACKE_cungql, + LAPACKE_cungql_work, + LAPACKE_cungqr, + LAPACKE_cungqr_work, + LAPACKE_cungrq, + LAPACKE_cungrq_work, + LAPACKE_cungtr, + LAPACKE_cungtr_work, + LAPACKE_cunmbr, + LAPACKE_cunmbr_work, + LAPACKE_cunmhr, + LAPACKE_cunmhr_work, + LAPACKE_cunmlq, + LAPACKE_cunmlq_work, + LAPACKE_cunmql, + LAPACKE_cunmql_work, + LAPACKE_cunmqr, + LAPACKE_cunmqr_work, + LAPACKE_cunmrq, + LAPACKE_cunmrq_work, + LAPACKE_cunmrz, + LAPACKE_cunmrz_work, + LAPACKE_cunmtr, + LAPACKE_cunmtr_work, + LAPACKE_cupgtr, + LAPACKE_cupgtr_work, + LAPACKE_cupmtr, + LAPACKE_cupmtr_work, + LAPACKE_dbbcsd, + LAPACKE_dbbcsd_work, + LAPACKE_dbdsdc, 
+ LAPACKE_dbdsdc_work, + LAPACKE_dbdsqr, + LAPACKE_dbdsqr_work, + LAPACKE_ddisna, + LAPACKE_ddisna_work, + LAPACKE_dgbbrd, + LAPACKE_dgbbrd_work, + LAPACKE_dgbcon, + LAPACKE_dgbcon_work, + LAPACKE_dgbequ, + LAPACKE_dgbequ_work, + LAPACKE_dgbequb, + LAPACKE_dgbequb_work, + LAPACKE_dgbrfs, + LAPACKE_dgbrfs_work, + LAPACKE_dgbsv, + LAPACKE_dgbsv_work, + LAPACKE_dgbsvx, + LAPACKE_dgbsvx_work, + LAPACKE_dgbtrf, + LAPACKE_dgbtrf_work, + LAPACKE_dgbtrs, + LAPACKE_dgbtrs_work, + LAPACKE_dgebak, + LAPACKE_dgebak_work, + LAPACKE_dgebal, + LAPACKE_dgebal_work, + LAPACKE_dgebrd, + LAPACKE_dgebrd_work, + LAPACKE_dgecon, + LAPACKE_dgecon_work, + LAPACKE_dgeequ, + LAPACKE_dgeequ_work, + LAPACKE_dgeequb, + LAPACKE_dgeequb_work, + LAPACKE_dgees, + LAPACKE_dgees_work, + LAPACKE_dgeesx, + LAPACKE_dgeesx_work, + LAPACKE_dgeev, + LAPACKE_dgeev_work, + LAPACKE_dgeevx, + LAPACKE_dgeevx_work, + LAPACKE_dgehrd, + LAPACKE_dgehrd_work, + LAPACKE_dgejsv, + LAPACKE_dgejsv_work, + LAPACKE_dgelq2, + LAPACKE_dgelq2_work, + LAPACKE_dgelqf, + LAPACKE_dgelqf_work, + LAPACKE_dgels, + LAPACKE_dgels_work, + LAPACKE_dgelsd, + LAPACKE_dgelsd_work, + LAPACKE_dgelss, + LAPACKE_dgelss_work, + LAPACKE_dgelsy, + LAPACKE_dgelsy_work, + LAPACKE_dgemqrt, + LAPACKE_dgemqrt_work, + LAPACKE_dgeqlf, + LAPACKE_dgeqlf_work, + LAPACKE_dgeqp3, + LAPACKE_dgeqp3_work, + LAPACKE_dgeqpf, + LAPACKE_dgeqpf_work, + LAPACKE_dgeqr2, + LAPACKE_dgeqr2_work, + LAPACKE_dgeqrf, + LAPACKE_dgeqrf_work, + LAPACKE_dgeqrfp, + LAPACKE_dgeqrfp_work, + LAPACKE_dgeqrt, + LAPACKE_dgeqrt2, + LAPACKE_dgeqrt2_work, + LAPACKE_dgeqrt3, + LAPACKE_dgeqrt3_work, + LAPACKE_dgeqrt_work, + LAPACKE_dgerfs, + LAPACKE_dgerfs_work, + LAPACKE_dgerqf, + LAPACKE_dgerqf_work, + LAPACKE_dgesdd, + LAPACKE_dgesdd_work, + LAPACKE_dgesv, + LAPACKE_dgesv_work, + LAPACKE_dgesvd, + LAPACKE_dgesvd_work, + LAPACKE_dgesvj, + LAPACKE_dgesvj_work, + LAPACKE_dgesvx, + LAPACKE_dgesvx_work, + LAPACKE_dgetf2, + LAPACKE_dgetf2_work, + LAPACKE_dgetrf, + LAPACKE_dgetrf_work, + LAPACKE_dgetri, + LAPACKE_dgetri_work, + LAPACKE_dgetrs, + LAPACKE_dgetrs_work, + LAPACKE_dggbak, + LAPACKE_dggbak_work, + LAPACKE_dggbal, + LAPACKE_dggbal_work, + LAPACKE_dgges, + LAPACKE_dgges_work, + LAPACKE_dggesx, + LAPACKE_dggesx_work, + LAPACKE_dggev, + LAPACKE_dggev_work, + LAPACKE_dggevx, + LAPACKE_dggevx_work, + LAPACKE_dggglm, + LAPACKE_dggglm_work, + LAPACKE_dgghrd, + LAPACKE_dgghrd_work, + LAPACKE_dgglse, + LAPACKE_dgglse_work, + LAPACKE_dggqrf, + LAPACKE_dggqrf_work, + LAPACKE_dggrqf, + LAPACKE_dggrqf_work, + LAPACKE_dggsvd, + LAPACKE_dggsvd_work, + LAPACKE_dggsvp, + LAPACKE_dggsvp_work, + LAPACKE_dgtcon, + LAPACKE_dgtcon_work, + LAPACKE_dgtrfs, + LAPACKE_dgtrfs_work, + LAPACKE_dgtsv, + LAPACKE_dgtsv_work, + LAPACKE_dgtsvx, + LAPACKE_dgtsvx_work, + LAPACKE_dgttrf, + LAPACKE_dgttrf_work, + LAPACKE_dgttrs, + LAPACKE_dgttrs_work, + LAPACKE_dhgeqz, + LAPACKE_dhgeqz_work, + LAPACKE_dhsein, + LAPACKE_dhsein_work, + LAPACKE_dhseqr, + LAPACKE_dhseqr_work, + LAPACKE_dlacpy, + LAPACKE_dlacpy_work, + LAPACKE_dlag2s, + LAPACKE_dlag2s_work, + LAPACKE_dlamch, + LAPACKE_dlamch_work, + LAPACKE_dlange, + LAPACKE_dlange_work, + LAPACKE_dlansy, + LAPACKE_dlansy_work, + LAPACKE_dlantr, + LAPACKE_dlantr_work, + LAPACKE_dlapmr, + LAPACKE_dlapmr_work, + LAPACKE_dlapy2, + LAPACKE_dlapy2_work, + LAPACKE_dlapy3, + LAPACKE_dlapy3_work, + LAPACKE_dlarfb, + LAPACKE_dlarfb_work, + LAPACKE_dlarfg, + LAPACKE_dlarfg_work, + LAPACKE_dlarft, + LAPACKE_dlarft_work, + LAPACKE_dlarfx, + LAPACKE_dlarfx_work, + LAPACKE_dlarnv, + LAPACKE_dlarnv_work, + 
LAPACKE_dlartgp, + LAPACKE_dlartgp_work, + LAPACKE_dlartgs, + LAPACKE_dlartgs_work, + LAPACKE_dlaset, + LAPACKE_dlaset_work, + LAPACKE_dlasrt, + LAPACKE_dlasrt_work, + LAPACKE_dlaswp, + LAPACKE_dlaswp_work, + LAPACKE_dlauum, + LAPACKE_dlauum_work, + LAPACKE_dopgtr, + LAPACKE_dopgtr_work, + LAPACKE_dopmtr, + LAPACKE_dopmtr_work, + LAPACKE_dorbdb, + LAPACKE_dorbdb_work, + LAPACKE_dorcsd, + LAPACKE_dorcsd_work, + LAPACKE_dorgbr, + LAPACKE_dorgbr_work, + LAPACKE_dorghr, + LAPACKE_dorghr_work, + LAPACKE_dorglq, + LAPACKE_dorglq_work, + LAPACKE_dorgql, + LAPACKE_dorgql_work, + LAPACKE_dorgqr, + LAPACKE_dorgqr_work, + LAPACKE_dorgrq, + LAPACKE_dorgrq_work, + LAPACKE_dorgtr, + LAPACKE_dorgtr_work, + LAPACKE_dormbr, + LAPACKE_dormbr_work, + LAPACKE_dormhr, + LAPACKE_dormhr_work, + LAPACKE_dormlq, + LAPACKE_dormlq_work, + LAPACKE_dormql, + LAPACKE_dormql_work, + LAPACKE_dormqr, + LAPACKE_dormqr_work, + LAPACKE_dormrq, + LAPACKE_dormrq_work, + LAPACKE_dormrz, + LAPACKE_dormrz_work, + LAPACKE_dormtr, + LAPACKE_dormtr_work, + LAPACKE_dpbcon, + LAPACKE_dpbcon_work, + LAPACKE_dpbequ, + LAPACKE_dpbequ_work, + LAPACKE_dpbrfs, + LAPACKE_dpbrfs_work, + LAPACKE_dpbstf, + LAPACKE_dpbstf_work, + LAPACKE_dpbsv, + LAPACKE_dpbsv_work, + LAPACKE_dpbsvx, + LAPACKE_dpbsvx_work, + LAPACKE_dpbtrf, + LAPACKE_dpbtrf_work, + LAPACKE_dpbtrs, + LAPACKE_dpbtrs_work, + LAPACKE_dpftrf, + LAPACKE_dpftrf_work, + LAPACKE_dpftri, + LAPACKE_dpftri_work, + LAPACKE_dpftrs, + LAPACKE_dpftrs_work, + LAPACKE_dpocon, + LAPACKE_dpocon_work, + LAPACKE_dpoequ, + LAPACKE_dpoequ_work, + LAPACKE_dpoequb, + LAPACKE_dpoequb_work, + LAPACKE_dporfs, + LAPACKE_dporfs_work, + LAPACKE_dposv, + LAPACKE_dposv_work, + LAPACKE_dposvx, + LAPACKE_dposvx_work, + LAPACKE_dpotrf, + LAPACKE_dpotrf_work, + LAPACKE_dpotri, + LAPACKE_dpotri_work, + LAPACKE_dpotrs, + LAPACKE_dpotrs_work, + LAPACKE_dppcon, + LAPACKE_dppcon_work, + LAPACKE_dppequ, + LAPACKE_dppequ_work, + LAPACKE_dpprfs, + LAPACKE_dpprfs_work, + LAPACKE_dppsv, + LAPACKE_dppsv_work, + LAPACKE_dppsvx, + LAPACKE_dppsvx_work, + LAPACKE_dpptrf, + LAPACKE_dpptrf_work, + LAPACKE_dpptri, + LAPACKE_dpptri_work, + LAPACKE_dpptrs, + LAPACKE_dpptrs_work, + LAPACKE_dpstrf, + LAPACKE_dpstrf_work, + LAPACKE_dptcon, + LAPACKE_dptcon_work, + LAPACKE_dpteqr, + LAPACKE_dpteqr_work, + LAPACKE_dptrfs, + LAPACKE_dptrfs_work, + LAPACKE_dptsv, + LAPACKE_dptsv_work, + LAPACKE_dptsvx, + LAPACKE_dptsvx_work, + LAPACKE_dpttrf, + LAPACKE_dpttrf_work, + LAPACKE_dpttrs, + LAPACKE_dpttrs_work, + LAPACKE_dsbev, + LAPACKE_dsbev_work, + LAPACKE_dsbevd, + LAPACKE_dsbevd_work, + LAPACKE_dsbevx, + LAPACKE_dsbevx_work, + LAPACKE_dsbgst, + LAPACKE_dsbgst_work, + LAPACKE_dsbgv, + LAPACKE_dsbgv_work, + LAPACKE_dsbgvd, + LAPACKE_dsbgvd_work, + LAPACKE_dsbgvx, + LAPACKE_dsbgvx_work, + LAPACKE_dsbtrd, + LAPACKE_dsbtrd_work, + LAPACKE_dsfrk, + LAPACKE_dsfrk_work, + LAPACKE_dsgesv, + LAPACKE_dsgesv_work, + LAPACKE_dspcon, + LAPACKE_dspcon_work, + LAPACKE_dspev, + LAPACKE_dspev_work, + LAPACKE_dspevd, + LAPACKE_dspevd_work, + LAPACKE_dspevx, + LAPACKE_dspevx_work, + LAPACKE_dspgst, + LAPACKE_dspgst_work, + LAPACKE_dspgv, + LAPACKE_dspgv_work, + LAPACKE_dspgvd, + LAPACKE_dspgvd_work, + LAPACKE_dspgvx, + LAPACKE_dspgvx_work, + LAPACKE_dsposv, + LAPACKE_dsposv_work, + LAPACKE_dsprfs, + LAPACKE_dsprfs_work, + LAPACKE_dspsv, + LAPACKE_dspsv_work, + LAPACKE_dspsvx, + LAPACKE_dspsvx_work, + LAPACKE_dsptrd, + LAPACKE_dsptrd_work, + LAPACKE_dsptrf, + LAPACKE_dsptrf_work, + LAPACKE_dsptri, + LAPACKE_dsptri_work, + LAPACKE_dsptrs, + LAPACKE_dsptrs_work, + 
LAPACKE_dstebz, + LAPACKE_dstebz_work, + LAPACKE_dstedc, + LAPACKE_dstedc_work, + LAPACKE_dstegr, + LAPACKE_dstegr_work, + LAPACKE_dstein, + LAPACKE_dstein_work, + LAPACKE_dstemr, + LAPACKE_dstemr_work, + LAPACKE_dsteqr, + LAPACKE_dsteqr_work, + LAPACKE_dsterf, + LAPACKE_dsterf_work, + LAPACKE_dstev, + LAPACKE_dstev_work, + LAPACKE_dstevd, + LAPACKE_dstevd_work, + LAPACKE_dstevr, + LAPACKE_dstevr_work, + LAPACKE_dstevx, + LAPACKE_dstevx_work, + LAPACKE_dsycon, + LAPACKE_dsycon_work, + LAPACKE_dsyconv, + LAPACKE_dsyconv_work, + LAPACKE_dsyequb, + LAPACKE_dsyequb_work, + LAPACKE_dsyev, + LAPACKE_dsyev_work, + LAPACKE_dsyevd, + LAPACKE_dsyevd_work, + LAPACKE_dsyevr, + LAPACKE_dsyevr_work, + LAPACKE_dsyevx, + LAPACKE_dsyevx_work, + LAPACKE_dsygst, + LAPACKE_dsygst_work, + LAPACKE_dsygv, + LAPACKE_dsygv_work, + LAPACKE_dsygvd, + LAPACKE_dsygvd_work, + LAPACKE_dsygvx, + LAPACKE_dsygvx_work, + LAPACKE_dsyrfs, + LAPACKE_dsyrfs_work, + LAPACKE_dsysv, + LAPACKE_dsysv_work, + LAPACKE_dsysvx, + LAPACKE_dsysvx_work, + LAPACKE_dsyswapr, + LAPACKE_dsyswapr_work, + LAPACKE_dsytrd, + LAPACKE_dsytrd_work, + LAPACKE_dsytrf, + LAPACKE_dsytrf_work, + LAPACKE_dsytri, + LAPACKE_dsytri2, + LAPACKE_dsytri2_work, + LAPACKE_dsytri2x, + LAPACKE_dsytri2x_work, + LAPACKE_dsytri_work, + LAPACKE_dsytrs, + LAPACKE_dsytrs2, + LAPACKE_dsytrs2_work, + LAPACKE_dsytrs_work, + LAPACKE_dtbcon, + LAPACKE_dtbcon_work, + LAPACKE_dtbrfs, + LAPACKE_dtbrfs_work, + LAPACKE_dtbtrs, + LAPACKE_dtbtrs_work, + LAPACKE_dtfsm, + LAPACKE_dtfsm_work, + LAPACKE_dtftri, + LAPACKE_dtftri_work, + LAPACKE_dtfttp, + LAPACKE_dtfttp_work, + LAPACKE_dtfttr, + LAPACKE_dtfttr_work, + LAPACKE_dtgevc, + LAPACKE_dtgevc_work, + LAPACKE_dtgexc, + LAPACKE_dtgexc_work, + LAPACKE_dtgsen, + LAPACKE_dtgsen_work, + LAPACKE_dtgsja, + LAPACKE_dtgsja_work, + LAPACKE_dtgsna, + LAPACKE_dtgsna_work, + LAPACKE_dtgsyl, + LAPACKE_dtgsyl_work, + LAPACKE_dtpcon, + LAPACKE_dtpcon_work, + LAPACKE_dtpmqrt, + LAPACKE_dtpmqrt_work, + LAPACKE_dtpqrt, + LAPACKE_dtpqrt2, + LAPACKE_dtpqrt2_work, + LAPACKE_dtpqrt_work, + LAPACKE_dtprfb, + LAPACKE_dtprfb_work, + LAPACKE_dtprfs, + LAPACKE_dtprfs_work, + LAPACKE_dtptri, + LAPACKE_dtptri_work, + LAPACKE_dtptrs, + LAPACKE_dtptrs_work, + LAPACKE_dtpttf, + LAPACKE_dtpttf_work, + LAPACKE_dtpttr, + LAPACKE_dtpttr_work, + LAPACKE_dtrcon, + LAPACKE_dtrcon_work, + LAPACKE_dtrevc, + LAPACKE_dtrevc_work, + LAPACKE_dtrexc, + LAPACKE_dtrexc_work, + LAPACKE_dtrrfs, + LAPACKE_dtrrfs_work, + LAPACKE_dtrsen, + LAPACKE_dtrsen_work, + LAPACKE_dtrsna, + LAPACKE_dtrsna_work, + LAPACKE_dtrsyl, + LAPACKE_dtrsyl_work, + LAPACKE_dtrtri, + LAPACKE_dtrtri_work, + LAPACKE_dtrtrs, + LAPACKE_dtrtrs_work, + LAPACKE_dtrttf, + LAPACKE_dtrttf_work, + LAPACKE_dtrttp, + LAPACKE_dtrttp_work, + LAPACKE_dtzrzf, + LAPACKE_dtzrzf_work, + LAPACKE_sbbcsd, + LAPACKE_sbbcsd_work, + LAPACKE_sbdsdc, + LAPACKE_sbdsdc_work, + LAPACKE_sbdsqr, + LAPACKE_sbdsqr_work, + LAPACKE_sdisna, + LAPACKE_sdisna_work, + LAPACKE_sgbbrd, + LAPACKE_sgbbrd_work, + LAPACKE_sgbcon, + LAPACKE_sgbcon_work, + LAPACKE_sgbequ, + LAPACKE_sgbequ_work, + LAPACKE_sgbequb, + LAPACKE_sgbequb_work, + LAPACKE_sgbrfs, + LAPACKE_sgbrfs_work, + LAPACKE_sgbsv, + LAPACKE_sgbsv_work, + LAPACKE_sgbsvx, + LAPACKE_sgbsvx_work, + LAPACKE_sgbtrf, + LAPACKE_sgbtrf_work, + LAPACKE_sgbtrs, + LAPACKE_sgbtrs_work, + LAPACKE_sgebak, + LAPACKE_sgebak_work, + LAPACKE_sgebal, + LAPACKE_sgebal_work, + LAPACKE_sgebrd, + LAPACKE_sgebrd_work, + LAPACKE_sgecon, + LAPACKE_sgecon_work, + LAPACKE_sgeequ, + LAPACKE_sgeequ_work, + LAPACKE_sgeequb, + 
LAPACKE_sgeequb_work, + LAPACKE_sgees, + LAPACKE_sgees_work, + LAPACKE_sgeesx, + LAPACKE_sgeesx_work, + LAPACKE_sgeev, + LAPACKE_sgeev_work, + LAPACKE_sgeevx, + LAPACKE_sgeevx_work, + LAPACKE_sgehrd, + LAPACKE_sgehrd_work, + LAPACKE_sgejsv, + LAPACKE_sgejsv_work, + LAPACKE_sgelq2, + LAPACKE_sgelq2_work, + LAPACKE_sgelqf, + LAPACKE_sgelqf_work, + LAPACKE_sgels, + LAPACKE_sgels_work, + LAPACKE_sgelsd, + LAPACKE_sgelsd_work, + LAPACKE_sgelss, + LAPACKE_sgelss_work, + LAPACKE_sgelsy, + LAPACKE_sgelsy_work, + LAPACKE_sgemqrt, + LAPACKE_sgemqrt_work, + LAPACKE_sgeqlf, + LAPACKE_sgeqlf_work, + LAPACKE_sgeqp3, + LAPACKE_sgeqp3_work, + LAPACKE_sgeqpf, + LAPACKE_sgeqpf_work, + LAPACKE_sgeqr2, + LAPACKE_sgeqr2_work, + LAPACKE_sgeqrf, + LAPACKE_sgeqrf_work, + LAPACKE_sgeqrfp, + LAPACKE_sgeqrfp_work, + LAPACKE_sgeqrt, + LAPACKE_sgeqrt2, + LAPACKE_sgeqrt2_work, + LAPACKE_sgeqrt3, + LAPACKE_sgeqrt3_work, + LAPACKE_sgeqrt_work, + LAPACKE_sgerfs, + LAPACKE_sgerfs_work, + LAPACKE_sgerqf, + LAPACKE_sgerqf_work, + LAPACKE_sgesdd, + LAPACKE_sgesdd_work, + LAPACKE_sgesv, + LAPACKE_sgesv_work, + LAPACKE_sgesvd, + LAPACKE_sgesvd_work, + LAPACKE_sgesvj, + LAPACKE_sgesvj_work, + LAPACKE_sgesvx, + LAPACKE_sgesvx_work, + LAPACKE_sgetf2, + LAPACKE_sgetf2_work, + LAPACKE_sgetrf, + LAPACKE_sgetrf_work, + LAPACKE_sgetri, + LAPACKE_sgetri_work, + LAPACKE_sgetrs, + LAPACKE_sgetrs_work, + LAPACKE_sggbak, + LAPACKE_sggbak_work, + LAPACKE_sggbal, + LAPACKE_sggbal_work, + LAPACKE_sgges, + LAPACKE_sgges_work, + LAPACKE_sggesx, + LAPACKE_sggesx_work, + LAPACKE_sggev, + LAPACKE_sggev_work, + LAPACKE_sggevx, + LAPACKE_sggevx_work, + LAPACKE_sggglm, + LAPACKE_sggglm_work, + LAPACKE_sgghrd, + LAPACKE_sgghrd_work, + LAPACKE_sgglse, + LAPACKE_sgglse_work, + LAPACKE_sggqrf, + LAPACKE_sggqrf_work, + LAPACKE_sggrqf, + LAPACKE_sggrqf_work, + LAPACKE_sggsvd, + LAPACKE_sggsvd_work, + LAPACKE_sggsvp, + LAPACKE_sggsvp_work, + LAPACKE_sgtcon, + LAPACKE_sgtcon_work, + LAPACKE_sgtrfs, + LAPACKE_sgtrfs_work, + LAPACKE_sgtsv, + LAPACKE_sgtsv_work, + LAPACKE_sgtsvx, + LAPACKE_sgtsvx_work, + LAPACKE_sgttrf, + LAPACKE_sgttrf_work, + LAPACKE_sgttrs, + LAPACKE_sgttrs_work, + LAPACKE_shgeqz, + LAPACKE_shgeqz_work, + LAPACKE_shsein, + LAPACKE_shsein_work, + LAPACKE_shseqr, + LAPACKE_shseqr_work, + LAPACKE_slacpy, + LAPACKE_slacpy_work, + LAPACKE_slag2d, + LAPACKE_slag2d_work, + LAPACKE_slamch, + LAPACKE_slamch_work, + LAPACKE_slange, + LAPACKE_slange_work, + LAPACKE_slansy, + LAPACKE_slansy_work, + LAPACKE_slantr, + LAPACKE_slantr_work, + LAPACKE_slapmr, + LAPACKE_slapmr_work, + LAPACKE_slapy2, + LAPACKE_slapy2_work, + LAPACKE_slapy3, + LAPACKE_slapy3_work, + LAPACKE_slarfb, + LAPACKE_slarfb_work, + LAPACKE_slarfg, + LAPACKE_slarfg_work, + LAPACKE_slarft, + LAPACKE_slarft_work, + LAPACKE_slarfx, + LAPACKE_slarfx_work, + LAPACKE_slarnv, + LAPACKE_slarnv_work, + LAPACKE_slartgp, + LAPACKE_slartgp_work, + LAPACKE_slartgs, + LAPACKE_slartgs_work, + LAPACKE_slaset, + LAPACKE_slaset_work, + LAPACKE_slasrt, + LAPACKE_slasrt_work, + LAPACKE_slaswp, + LAPACKE_slaswp_work, + LAPACKE_slauum, + LAPACKE_slauum_work, + LAPACKE_sopgtr, + LAPACKE_sopgtr_work, + LAPACKE_sopmtr, + LAPACKE_sopmtr_work, + LAPACKE_sorbdb, + LAPACKE_sorbdb_work, + LAPACKE_sorcsd, + LAPACKE_sorcsd_work, + LAPACKE_sorgbr, + LAPACKE_sorgbr_work, + LAPACKE_sorghr, + LAPACKE_sorghr_work, + LAPACKE_sorglq, + LAPACKE_sorglq_work, + LAPACKE_sorgql, + LAPACKE_sorgql_work, + LAPACKE_sorgqr, + LAPACKE_sorgqr_work, + LAPACKE_sorgrq, + LAPACKE_sorgrq_work, + LAPACKE_sorgtr, + LAPACKE_sorgtr_work, + 
LAPACKE_sormbr, + LAPACKE_sormbr_work, + LAPACKE_sormhr, + LAPACKE_sormhr_work, + LAPACKE_sormlq, + LAPACKE_sormlq_work, + LAPACKE_sormql, + LAPACKE_sormql_work, + LAPACKE_sormqr, + LAPACKE_sormqr_work, + LAPACKE_sormrq, + LAPACKE_sormrq_work, + LAPACKE_sormrz, + LAPACKE_sormrz_work, + LAPACKE_sormtr, + LAPACKE_sormtr_work, + LAPACKE_spbcon, + LAPACKE_spbcon_work, + LAPACKE_spbequ, + LAPACKE_spbequ_work, + LAPACKE_spbrfs, + LAPACKE_spbrfs_work, + LAPACKE_spbstf, + LAPACKE_spbstf_work, + LAPACKE_spbsv, + LAPACKE_spbsv_work, + LAPACKE_spbsvx, + LAPACKE_spbsvx_work, + LAPACKE_spbtrf, + LAPACKE_spbtrf_work, + LAPACKE_spbtrs, + LAPACKE_spbtrs_work, + LAPACKE_spftrf, + LAPACKE_spftrf_work, + LAPACKE_spftri, + LAPACKE_spftri_work, + LAPACKE_spftrs, + LAPACKE_spftrs_work, + LAPACKE_spocon, + LAPACKE_spocon_work, + LAPACKE_spoequ, + LAPACKE_spoequ_work, + LAPACKE_spoequb, + LAPACKE_spoequb_work, + LAPACKE_sporfs, + LAPACKE_sporfs_work, + LAPACKE_sposv, + LAPACKE_sposv_work, + LAPACKE_sposvx, + LAPACKE_sposvx_work, + LAPACKE_spotrf, + LAPACKE_spotrf_work, + LAPACKE_spotri, + LAPACKE_spotri_work, + LAPACKE_spotrs, + LAPACKE_spotrs_work, + LAPACKE_sppcon, + LAPACKE_sppcon_work, + LAPACKE_sppequ, + LAPACKE_sppequ_work, + LAPACKE_spprfs, + LAPACKE_spprfs_work, + LAPACKE_sppsv, + LAPACKE_sppsv_work, + LAPACKE_sppsvx, + LAPACKE_sppsvx_work, + LAPACKE_spptrf, + LAPACKE_spptrf_work, + LAPACKE_spptri, + LAPACKE_spptri_work, + LAPACKE_spptrs, + LAPACKE_spptrs_work, + LAPACKE_spstrf, + LAPACKE_spstrf_work, + LAPACKE_sptcon, + LAPACKE_sptcon_work, + LAPACKE_spteqr, + LAPACKE_spteqr_work, + LAPACKE_sptrfs, + LAPACKE_sptrfs_work, + LAPACKE_sptsv, + LAPACKE_sptsv_work, + LAPACKE_sptsvx, + LAPACKE_sptsvx_work, + LAPACKE_spttrf, + LAPACKE_spttrf_work, + LAPACKE_spttrs, + LAPACKE_spttrs_work, + LAPACKE_ssbev, + LAPACKE_ssbev_work, + LAPACKE_ssbevd, + LAPACKE_ssbevd_work, + LAPACKE_ssbevx, + LAPACKE_ssbevx_work, + LAPACKE_ssbgst, + LAPACKE_ssbgst_work, + LAPACKE_ssbgv, + LAPACKE_ssbgv_work, + LAPACKE_ssbgvd, + LAPACKE_ssbgvd_work, + LAPACKE_ssbgvx, + LAPACKE_ssbgvx_work, + LAPACKE_ssbtrd, + LAPACKE_ssbtrd_work, + LAPACKE_ssfrk, + LAPACKE_ssfrk_work, + LAPACKE_sspcon, + LAPACKE_sspcon_work, + LAPACKE_sspev, + LAPACKE_sspev_work, + LAPACKE_sspevd, + LAPACKE_sspevd_work, + LAPACKE_sspevx, + LAPACKE_sspevx_work, + LAPACKE_sspgst, + LAPACKE_sspgst_work, + LAPACKE_sspgv, + LAPACKE_sspgv_work, + LAPACKE_sspgvd, + LAPACKE_sspgvd_work, + LAPACKE_sspgvx, + LAPACKE_sspgvx_work, + LAPACKE_ssprfs, + LAPACKE_ssprfs_work, + LAPACKE_sspsv, + LAPACKE_sspsv_work, + LAPACKE_sspsvx, + LAPACKE_sspsvx_work, + LAPACKE_ssptrd, + LAPACKE_ssptrd_work, + LAPACKE_ssptrf, + LAPACKE_ssptrf_work, + LAPACKE_ssptri, + LAPACKE_ssptri_work, + LAPACKE_ssptrs, + LAPACKE_ssptrs_work, + LAPACKE_sstebz, + LAPACKE_sstebz_work, + LAPACKE_sstedc, + LAPACKE_sstedc_work, + LAPACKE_sstegr, + LAPACKE_sstegr_work, + LAPACKE_sstein, + LAPACKE_sstein_work, + LAPACKE_sstemr, + LAPACKE_sstemr_work, + LAPACKE_ssteqr, + LAPACKE_ssteqr_work, + LAPACKE_ssterf, + LAPACKE_ssterf_work, + LAPACKE_sstev, + LAPACKE_sstev_work, + LAPACKE_sstevd, + LAPACKE_sstevd_work, + LAPACKE_sstevr, + LAPACKE_sstevr_work, + LAPACKE_sstevx, + LAPACKE_sstevx_work, + LAPACKE_ssycon, + LAPACKE_ssycon_work, + LAPACKE_ssyconv, + LAPACKE_ssyconv_work, + LAPACKE_ssyequb, + LAPACKE_ssyequb_work, + LAPACKE_ssyev, + LAPACKE_ssyev_work, + LAPACKE_ssyevd, + LAPACKE_ssyevd_work, + LAPACKE_ssyevr, + LAPACKE_ssyevr_work, + LAPACKE_ssyevx, + LAPACKE_ssyevx_work, + LAPACKE_ssygst, + LAPACKE_ssygst_work, + 
LAPACKE_ssygv, + LAPACKE_ssygv_work, + LAPACKE_ssygvd, + LAPACKE_ssygvd_work, + LAPACKE_ssygvx, + LAPACKE_ssygvx_work, + LAPACKE_ssyrfs, + LAPACKE_ssyrfs_work, + LAPACKE_ssysv, + LAPACKE_ssysv_work, + LAPACKE_ssysvx, + LAPACKE_ssysvx_work, + LAPACKE_ssyswapr, + LAPACKE_ssyswapr_work, + LAPACKE_ssytrd, + LAPACKE_ssytrd_work, + LAPACKE_ssytrf, + LAPACKE_ssytrf_work, + LAPACKE_ssytri, + LAPACKE_ssytri2, + LAPACKE_ssytri2_work, + LAPACKE_ssytri2x, + LAPACKE_ssytri2x_work, + LAPACKE_ssytri_work, + LAPACKE_ssytrs, + LAPACKE_ssytrs2, + LAPACKE_ssytrs2_work, + LAPACKE_ssytrs_work, + LAPACKE_stbcon, + LAPACKE_stbcon_work, + LAPACKE_stbrfs, + LAPACKE_stbrfs_work, + LAPACKE_stbtrs, + LAPACKE_stbtrs_work, + LAPACKE_stfsm, + LAPACKE_stfsm_work, + LAPACKE_stftri, + LAPACKE_stftri_work, + LAPACKE_stfttp, + LAPACKE_stfttp_work, + LAPACKE_stfttr, + LAPACKE_stfttr_work, + LAPACKE_stgevc, + LAPACKE_stgevc_work, + LAPACKE_stgexc, + LAPACKE_stgexc_work, + LAPACKE_stgsen, + LAPACKE_stgsen_work, + LAPACKE_stgsja, + LAPACKE_stgsja_work, + LAPACKE_stgsna, + LAPACKE_stgsna_work, + LAPACKE_stgsyl, + LAPACKE_stgsyl_work, + LAPACKE_stpcon, + LAPACKE_stpcon_work, + LAPACKE_stpmqrt, + LAPACKE_stpmqrt_work, + LAPACKE_stpqrt2, + LAPACKE_stpqrt2_work, + LAPACKE_stprfb, + LAPACKE_stprfb_work, + LAPACKE_stprfs, + LAPACKE_stprfs_work, + LAPACKE_stptri, + LAPACKE_stptri_work, + LAPACKE_stptrs, + LAPACKE_stptrs_work, + LAPACKE_stpttf, + LAPACKE_stpttf_work, + LAPACKE_stpttr, + LAPACKE_stpttr_work, + LAPACKE_strcon, + LAPACKE_strcon_work, + LAPACKE_strevc, + LAPACKE_strevc_work, + LAPACKE_strexc, + LAPACKE_strexc_work, + LAPACKE_strrfs, + LAPACKE_strrfs_work, + LAPACKE_strsen, + LAPACKE_strsen_work, + LAPACKE_strsna, + LAPACKE_strsna_work, + LAPACKE_strsyl, + LAPACKE_strsyl_work, + LAPACKE_strtri, + LAPACKE_strtri_work, + LAPACKE_strtrs, + LAPACKE_strtrs_work, + LAPACKE_strttf, + LAPACKE_strttf_work, + LAPACKE_strttp, + LAPACKE_strttp_work, + LAPACKE_stzrzf, + LAPACKE_stzrzf_work, + LAPACKE_zbbcsd, + LAPACKE_zbbcsd_work, + LAPACKE_zbdsqr, + LAPACKE_zbdsqr_work, + LAPACKE_zcgesv, + LAPACKE_zcgesv_work, + LAPACKE_zcposv, + LAPACKE_zcposv_work, + LAPACKE_zgbbrd, + LAPACKE_zgbbrd_work, + LAPACKE_zgbcon, + LAPACKE_zgbcon_work, + LAPACKE_zgbequ, + LAPACKE_zgbequ_work, + LAPACKE_zgbequb, + LAPACKE_zgbequb_work, + LAPACKE_zgbrfs, + LAPACKE_zgbrfs_work, + LAPACKE_zgbsv, + LAPACKE_zgbsv_work, + LAPACKE_zgbsvx, + LAPACKE_zgbsvx_work, + LAPACKE_zgbtrf, + LAPACKE_zgbtrf_work, + LAPACKE_zgbtrs, + LAPACKE_zgbtrs_work, + LAPACKE_zgebak, + LAPACKE_zgebak_work, + LAPACKE_zgebal, + LAPACKE_zgebal_work, + LAPACKE_zgebrd, + LAPACKE_zgebrd_work, + LAPACKE_zgecon, + LAPACKE_zgecon_work, + LAPACKE_zgeequ, + LAPACKE_zgeequ_work, + LAPACKE_zgeequb, + LAPACKE_zgeequb_work, + LAPACKE_zgees, + LAPACKE_zgees_work, + LAPACKE_zgeesx, + LAPACKE_zgeesx_work, + LAPACKE_zgeev, + LAPACKE_zgeev_work, + LAPACKE_zgeevx, + LAPACKE_zgeevx_work, + LAPACKE_zgehrd, + LAPACKE_zgehrd_work, + LAPACKE_zgelq2, + LAPACKE_zgelq2_work, + LAPACKE_zgelqf, + LAPACKE_zgelqf_work, + LAPACKE_zgels, + LAPACKE_zgels_work, + LAPACKE_zgelsd, + LAPACKE_zgelsd_work, + LAPACKE_zgelss, + LAPACKE_zgelss_work, + LAPACKE_zgelsy, + LAPACKE_zgelsy_work, + LAPACKE_zgemqrt, + LAPACKE_zgemqrt_work, + LAPACKE_zgeqlf, + LAPACKE_zgeqlf_work, + LAPACKE_zgeqp3, + LAPACKE_zgeqp3_work, + LAPACKE_zgeqpf, + LAPACKE_zgeqpf_work, + LAPACKE_zgeqr2, + LAPACKE_zgeqr2_work, + LAPACKE_zgeqrf, + LAPACKE_zgeqrf_work, + LAPACKE_zgeqrfp, + LAPACKE_zgeqrfp_work, + LAPACKE_zgeqrt, + LAPACKE_zgeqrt2, + LAPACKE_zgeqrt2_work, 
+ LAPACKE_zgeqrt3, + LAPACKE_zgeqrt3_work, + LAPACKE_zgeqrt_work, + LAPACKE_zgerfs, + LAPACKE_zgerfs_work, + LAPACKE_zgerqf, + LAPACKE_zgerqf_work, + LAPACKE_zgesdd, + LAPACKE_zgesdd_work, + LAPACKE_zgesv, + LAPACKE_zgesv_work, + LAPACKE_zgesvd, + LAPACKE_zgesvd_work, + LAPACKE_zgesvx, + LAPACKE_zgesvx_work, + LAPACKE_zgetf2, + LAPACKE_zgetf2_work, + LAPACKE_zgetrf, + LAPACKE_zgetrf_work, + LAPACKE_zgetri, + LAPACKE_zgetri_work, + LAPACKE_zgetrs, + LAPACKE_zgetrs_work, + LAPACKE_zggbak, + LAPACKE_zggbak_work, + LAPACKE_zggbal, + LAPACKE_zggbal_work, + LAPACKE_zgges, + LAPACKE_zgges_work, + LAPACKE_zggesx, + LAPACKE_zggesx_work, + LAPACKE_zggev, + LAPACKE_zggev_work, + LAPACKE_zggevx, + LAPACKE_zggevx_work, + LAPACKE_zggglm, + LAPACKE_zggglm_work, + LAPACKE_zgghrd, + LAPACKE_zgghrd_work, + LAPACKE_zgglse, + LAPACKE_zgglse_work, + LAPACKE_zggqrf, + LAPACKE_zggqrf_work, + LAPACKE_zggrqf, + LAPACKE_zggrqf_work, + LAPACKE_zggsvd, + LAPACKE_zggsvd_work, + LAPACKE_zggsvp, + LAPACKE_zggsvp_work, + LAPACKE_zgtcon, + LAPACKE_zgtcon_work, + LAPACKE_zgtrfs, + LAPACKE_zgtrfs_work, + LAPACKE_zgtsv, + LAPACKE_zgtsv_work, + LAPACKE_zgtsvx, + LAPACKE_zgtsvx_work, + LAPACKE_zgttrf, + LAPACKE_zgttrf_work, + LAPACKE_zgttrs, + LAPACKE_zgttrs_work, + LAPACKE_zhbev, + LAPACKE_zhbev_work, + LAPACKE_zhbevd, + LAPACKE_zhbevd_work, + LAPACKE_zhbevx, + LAPACKE_zhbevx_work, + LAPACKE_zhbgst, + LAPACKE_zhbgst_work, + LAPACKE_zhbgv, + LAPACKE_zhbgv_work, + LAPACKE_zhbgvd, + LAPACKE_zhbgvd_work, + LAPACKE_zhbgvx, + LAPACKE_zhbgvx_work, + LAPACKE_zhbtrd, + LAPACKE_zhbtrd_work, + LAPACKE_zhecon, + LAPACKE_zhecon_work, + LAPACKE_zheequb, + LAPACKE_zheequb_work, + LAPACKE_zheev, + LAPACKE_zheev_work, + LAPACKE_zheevd, + LAPACKE_zheevd_work, + LAPACKE_zheevr, + LAPACKE_zheevr_work, + LAPACKE_zheevx, + LAPACKE_zheevx_work, + LAPACKE_zhegst, + LAPACKE_zhegst_work, + LAPACKE_zhegv, + LAPACKE_zhegv_work, + LAPACKE_zhegvd, + LAPACKE_zhegvd_work, + LAPACKE_zhegvx, + LAPACKE_zhegvx_work, + LAPACKE_zherfs, + LAPACKE_zherfs_work, + LAPACKE_zhesv, + LAPACKE_zhesv_work, + LAPACKE_zhesvx, + LAPACKE_zhesvx_work, + LAPACKE_zheswapr, + LAPACKE_zheswapr_work, + LAPACKE_zhetrd, + LAPACKE_zhetrd_work, + LAPACKE_zhetrf, + LAPACKE_zhetrf_work, + LAPACKE_zhetri, + LAPACKE_zhetri2, + LAPACKE_zhetri2_work, + LAPACKE_zhetri2x, + LAPACKE_zhetri2x_work, + LAPACKE_zhetri_work, + LAPACKE_zhetrs, + LAPACKE_zhetrs2, + LAPACKE_zhetrs2_work, + LAPACKE_zhetrs_work, + LAPACKE_zhfrk, + LAPACKE_zhfrk_work, + LAPACKE_zhgeqz, + LAPACKE_zhgeqz_work, + LAPACKE_zhpcon, + LAPACKE_zhpcon_work, + LAPACKE_zhpev, + LAPACKE_zhpev_work, + LAPACKE_zhpevd, + LAPACKE_zhpevd_work, + LAPACKE_zhpevx, + LAPACKE_zhpevx_work, + LAPACKE_zhpgst, + LAPACKE_zhpgst_work, + LAPACKE_zhpgv, + LAPACKE_zhpgv_work, + LAPACKE_zhpgvd, + LAPACKE_zhpgvd_work, + LAPACKE_zhpgvx, + LAPACKE_zhpgvx_work, + LAPACKE_zhprfs, + LAPACKE_zhprfs_work, + LAPACKE_zhpsv, + LAPACKE_zhpsv_work, + LAPACKE_zhpsvx, + LAPACKE_zhpsvx_work, + LAPACKE_zhptrd, + LAPACKE_zhptrd_work, + LAPACKE_zhptrf, + LAPACKE_zhptrf_work, + LAPACKE_zhptri, + LAPACKE_zhptri_work, + LAPACKE_zhptrs, + LAPACKE_zhptrs_work, + LAPACKE_zhsein, + LAPACKE_zhsein_work, + LAPACKE_zhseqr, + LAPACKE_zhseqr_work, + LAPACKE_zlacgv, + LAPACKE_zlacgv_work, + LAPACKE_zlacpy, + LAPACKE_zlacpy_work, + LAPACKE_zlag2c, + LAPACKE_zlag2c_work, + LAPACKE_zlange, + LAPACKE_zlange_work, + LAPACKE_zlanhe, + LAPACKE_zlanhe_work, + LAPACKE_zlansy, + LAPACKE_zlansy_work, + LAPACKE_zlantr, + LAPACKE_zlantr_work, + LAPACKE_zlapmr, + LAPACKE_zlapmr_work, + 
LAPACKE_zlarfb, + LAPACKE_zlarfb_work, + LAPACKE_zlarfg, + LAPACKE_zlarfg_work, + LAPACKE_zlarft, + LAPACKE_zlarft_work, + LAPACKE_zlarfx, + LAPACKE_zlarfx_work, + LAPACKE_zlarnv, + LAPACKE_zlarnv_work, + LAPACKE_zlaset, + LAPACKE_zlaset_work, + LAPACKE_zlaswp, + LAPACKE_zlaswp_work, + LAPACKE_zlauum, + LAPACKE_zlauum_work, + LAPACKE_zpbcon, + LAPACKE_zpbcon_work, + LAPACKE_zpbequ, + LAPACKE_zpbequ_work, + LAPACKE_zpbrfs, + LAPACKE_zpbrfs_work, + LAPACKE_zpbstf, + LAPACKE_zpbstf_work, + LAPACKE_zpbsv, + LAPACKE_zpbsv_work, + LAPACKE_zpbsvx, + LAPACKE_zpbsvx_work, + LAPACKE_zpbtrf, + LAPACKE_zpbtrf_work, + LAPACKE_zpbtrs, + LAPACKE_zpbtrs_work, + LAPACKE_zpftrf, + LAPACKE_zpftrf_work, + LAPACKE_zpftri, + LAPACKE_zpftri_work, + LAPACKE_zpftrs, + LAPACKE_zpftrs_work, + LAPACKE_zpocon, + LAPACKE_zpocon_work, + LAPACKE_zpoequ, + LAPACKE_zpoequ_work, + LAPACKE_zpoequb, + LAPACKE_zpoequb_work, + LAPACKE_zporfs, + LAPACKE_zporfs_work, + LAPACKE_zposv, + LAPACKE_zposv_work, + LAPACKE_zposvx, + LAPACKE_zposvx_work, + LAPACKE_zpotrf, + LAPACKE_zpotrf_work, + LAPACKE_zpotri, + LAPACKE_zpotri_work, + LAPACKE_zpotrs, + LAPACKE_zpotrs_work, + LAPACKE_zppcon, + LAPACKE_zppcon_work, + LAPACKE_zppequ, + LAPACKE_zppequ_work, + LAPACKE_zpprfs, + LAPACKE_zpprfs_work, + LAPACKE_zppsv, + LAPACKE_zppsv_work, + LAPACKE_zppsvx, + LAPACKE_zppsvx_work, + LAPACKE_zpptrf, + LAPACKE_zpptrf_work, + LAPACKE_zpptri, + LAPACKE_zpptri_work, + LAPACKE_zpptrs, + LAPACKE_zpptrs_work, + LAPACKE_zpstrf, + LAPACKE_zpstrf_work, + LAPACKE_zptcon, + LAPACKE_zptcon_work, + LAPACKE_zpteqr, + LAPACKE_zpteqr_work, + LAPACKE_zptrfs, + LAPACKE_zptrfs_work, + LAPACKE_zptsv, + LAPACKE_zptsv_work, + LAPACKE_zptsvx, + LAPACKE_zptsvx_work, + LAPACKE_zpttrf, + LAPACKE_zpttrf_work, + LAPACKE_zpttrs, + LAPACKE_zpttrs_work, + LAPACKE_zspcon, + LAPACKE_zspcon_work, + LAPACKE_zsprfs, + LAPACKE_zsprfs_work, + LAPACKE_zspsv, + LAPACKE_zspsv_work, + LAPACKE_zspsvx, + LAPACKE_zspsvx_work, + LAPACKE_zsptrf, + LAPACKE_zsptrf_work, + LAPACKE_zsptri, + LAPACKE_zsptri_work, + LAPACKE_zsptrs, + LAPACKE_zsptrs_work, + LAPACKE_zstedc, + LAPACKE_zstedc_work, + LAPACKE_zstegr, + LAPACKE_zstegr_work, + LAPACKE_zstein, + LAPACKE_zstein_work, + LAPACKE_zstemr, + LAPACKE_zstemr_work, + LAPACKE_zsteqr, + LAPACKE_zsteqr_work, + LAPACKE_zsycon, + LAPACKE_zsycon_work, + LAPACKE_zsyconv, + LAPACKE_zsyconv_work, + LAPACKE_zsyequb, + LAPACKE_zsyequb_work, + LAPACKE_zsyrfs, + LAPACKE_zsyrfs_work, + LAPACKE_zsysv, + LAPACKE_zsysv_work, + LAPACKE_zsysvx, + LAPACKE_zsysvx_work, + LAPACKE_zsyswapr, + LAPACKE_zsyswapr_work, + LAPACKE_zsytrf, + LAPACKE_zsytrf_work, + LAPACKE_zsytri, + LAPACKE_zsytri2, + LAPACKE_zsytri2_work, + LAPACKE_zsytri2x, + LAPACKE_zsytri2x_work, + LAPACKE_zsytri_work, + LAPACKE_zsytrs, + LAPACKE_zsytrs2, + LAPACKE_zsytrs2_work, + LAPACKE_zsytrs_work, + LAPACKE_ztbcon, + LAPACKE_ztbcon_work, + LAPACKE_ztbrfs, + LAPACKE_ztbrfs_work, + LAPACKE_ztbtrs, + LAPACKE_ztbtrs_work, + LAPACKE_ztfsm, + LAPACKE_ztfsm_work, + LAPACKE_ztftri, + LAPACKE_ztftri_work, + LAPACKE_ztfttp, + LAPACKE_ztfttp_work, + LAPACKE_ztfttr, + LAPACKE_ztfttr_work, + LAPACKE_ztgevc, + LAPACKE_ztgevc_work, + LAPACKE_ztgexc, + LAPACKE_ztgexc_work, + LAPACKE_ztgsen, + LAPACKE_ztgsen_work, + LAPACKE_ztgsja, + LAPACKE_ztgsja_work, + LAPACKE_ztgsna, + LAPACKE_ztgsna_work, + LAPACKE_ztgsyl, + LAPACKE_ztgsyl_work, + LAPACKE_ztpcon, + LAPACKE_ztpcon_work, + LAPACKE_ztpmqrt, + LAPACKE_ztpmqrt_work, + LAPACKE_ztpqrt, + LAPACKE_ztpqrt2, + LAPACKE_ztpqrt2_work, + LAPACKE_ztpqrt_work, + LAPACKE_ztprfb, + 
LAPACKE_ztprfb_work, + LAPACKE_ztprfs, + LAPACKE_ztprfs_work, + LAPACKE_ztptri, + LAPACKE_ztptri_work, + LAPACKE_ztptrs, + LAPACKE_ztptrs_work, + LAPACKE_ztpttf, + LAPACKE_ztpttf_work, + LAPACKE_ztpttr, + LAPACKE_ztpttr_work, + LAPACKE_ztrcon, + LAPACKE_ztrcon_work, + LAPACKE_ztrevc, + LAPACKE_ztrevc_work, + LAPACKE_ztrexc, + LAPACKE_ztrexc_work, + LAPACKE_ztrrfs, + LAPACKE_ztrrfs_work, + LAPACKE_ztrsen, + LAPACKE_ztrsen_work, + LAPACKE_ztrsna, + LAPACKE_ztrsna_work, + LAPACKE_ztrsyl, + LAPACKE_ztrsyl_work, + LAPACKE_ztrtri, + LAPACKE_ztrtri_work, + LAPACKE_ztrtrs, + LAPACKE_ztrtrs_work, + LAPACKE_ztrttf, + LAPACKE_ztrttf_work, + LAPACKE_ztrttp, + LAPACKE_ztrttp_work, + LAPACKE_ztzrzf, + LAPACKE_ztzrzf_work, + LAPACKE_zunbdb, + LAPACKE_zunbdb_work, + LAPACKE_zuncsd, + LAPACKE_zuncsd_work, + LAPACKE_zungbr, + LAPACKE_zungbr_work, + LAPACKE_zunghr, + LAPACKE_zunghr_work, + LAPACKE_zunglq, + LAPACKE_zunglq_work, + LAPACKE_zungql, + LAPACKE_zungql_work, + LAPACKE_zungqr, + LAPACKE_zungqr_work, + LAPACKE_zungrq, + LAPACKE_zungrq_work, + LAPACKE_zungtr, + LAPACKE_zungtr_work, + LAPACKE_zunmbr, + LAPACKE_zunmbr_work, + LAPACKE_zunmhr, + LAPACKE_zunmhr_work, + LAPACKE_zunmlq, + LAPACKE_zunmlq_work, + LAPACKE_zunmql, + LAPACKE_zunmql_work, + LAPACKE_zunmqr, + LAPACKE_zunmqr_work, + LAPACKE_zunmrq, + LAPACKE_zunmrq_work, + LAPACKE_zunmrz, + LAPACKE_zunmrz_work, + LAPACKE_zunmtr, + LAPACKE_zunmtr_work, + LAPACKE_zupgtr, + LAPACKE_zupgtr_work, + LAPACKE_zupmtr, + LAPACKE_zupmtr_work, + LAPACKE_zsyr, + LAPACKE_csyr, + LAPACKE_zsyr_work, + LAPACKE_csyr_work, + + ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the + ## corresponding LAPACK extended precision routines. + #LAPACKE_cgbrfsx, + #LAPACKE_cporfsx, + #LAPACKE_dgerfsx, + #LAPACKE_sgbrfsx, + #LAPACKE_ssyrfsx, + #LAPACKE_zherfsx, + #LAPACKE_cgbrfsx_work, + #LAPACKE_cporfsx_work, + #LAPACKE_dgerfsx_work, + #LAPACKE_sgbrfsx_work, + #LAPACKE_ssyrfsx_work, + #LAPACKE_zherfsx_work, + #LAPACKE_cgerfsx, + #LAPACKE_csyrfsx, + #LAPACKE_dporfsx, + #LAPACKE_sgerfsx, + #LAPACKE_zgbrfsx, + #LAPACKE_zporfsx, + #LAPACKE_cgerfsx_work, + #LAPACKE_csyrfsx_work, + #LAPACKE_dporfsx_work, + #LAPACKE_sgerfsx_work, + #LAPACKE_zgbrfsx_work, + #LAPACKE_zporfsx_work, + #LAPACKE_cherfsx, + #LAPACKE_dgbrfsx, + #LAPACKE_dsyrfsx, + #LAPACKE_sporfsx, + #LAPACKE_zgerfsx, + #LAPACKE_zsyrfsx, + #LAPACKE_cherfsx_work, + #LAPACKE_dgbrfsx_work, + #LAPACKE_dsyrfsx_work, + #LAPACKE_sporfsx_work, + #LAPACKE_zgerfsx_work, + #LAPACKE_zsyrfsx_work, + #LAPACKE_cgbsvxx, + #LAPACKE_cposvxx, + #LAPACKE_dgesvxx, + #LAPACKE_sgbsvxx, + #LAPACKE_ssysvxx, + #LAPACKE_zhesvxx, + #LAPACKE_cgbsvxx_work, + #LAPACKE_cposvxx_work, + #LAPACKE_dgesvxx_work, + #LAPACKE_sgbsvxx_work, + #LAPACKE_ssysvxx_work, + #LAPACKE_zhesvxx_work, + #LAPACKE_cgesvxx, + #LAPACKE_csysvxx, + #LAPACKE_dposvxx, + #LAPACKE_sgesvxx, + #LAPACKE_zgbsvxx, + #LAPACKE_zposvxx, + #LAPACKE_cgesvxx_work, + #LAPACKE_csysvxx_work, + #LAPACKE_dposvxx_work, + #LAPACKE_sgesvxx_work, + #LAPACKE_zgbsvxx_work, + #LAPACKE_zposvxx_work, + #LAPACKE_chesvxx, + #LAPACKE_dgbsvxx, + #LAPACKE_dsysvxx, + #LAPACKE_sposvxx, + #LAPACKE_zgesvxx, + #LAPACKE_zsysvxx, + #LAPACKE_chesvxx_work, + #LAPACKE_dgbsvxx_work, + #LAPACKE_dsysvxx_work, + #LAPACKE_sposvxx_work, + #LAPACKE_zgesvxx_work, + #LAPACKE_zsysvxx_work, + + ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg + ## (see 
`lapack-3.4.1/TESTING/MATGEN`). + #LAPACKE_clatms, + #LAPACKE_clatms_work, + #LAPACKE_dlatms, + #LAPACKE_dlatms_work, + #LAPACKE_slatms, + #LAPACKE_slatms_work, + #LAPACKE_zlatms, + #LAPACKE_zlatms_work, + #LAPACKE_clagge, + #LAPACKE_clagge_work, + #LAPACKE_dlagge, + #LAPACKE_dlagge_work, + #LAPACKE_slagge, + #LAPACKE_slagge_work, + #LAPACKE_zlagge, + #LAPACKE_zlagge_work, + #LAPACKE_claghe, + #LAPACKE_claghe_work, + #LAPACKE_zlaghe, + #LAPACKE_zlaghe_work, + #LAPACKE_clagsy, + #LAPACKE_clagsy_work, + #LAPACKE_dlagsy, + #LAPACKE_dlagsy_work, + #LAPACKE_slagsy, + #LAPACKE_slagsy_work, + #LAPACKE_zlagsy, + #LAPACKE_zlagsy_work, ); if ($ARGV[5] == 1) { #NO_LAPACK=1 - @objs = (@blasobjs); -} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") { - @objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @misc_underscore_objs); +} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" || + -d "../lapack-3.4.2") { + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { - @objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } -if ($ARGV[3] == 1){ @objs = (@objs, @exblasobjs); }; +if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; -if ($ARGV[1] eq "X86_64"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "x86"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "ia64"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "MIPS"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; + +if ($ARGV[4] == 0) { + @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); +}else{ + #NO_CBLAS=1 + @no_underscore_objs = (@misc_no_underscore_objs); +} +if ($ARGV[6] == 1) { + #NO_LAPACKE=1 + @no_underscore_objs = (@no_underscore_objs); +} else { + @no_underscore_objs = (@no_underscore_objs, @lapackeobjs); +} @linuxobjs = ('__strtol_internal', 'exit', 'free', 'getenv', 'malloc', 'mmap', 'printf', 'sqrt', @@ -333,15 +2716,15 @@ $bu = $ARGV[2]; $bu = "" if (($bu eq "0") || ($bu eq "1")); if ($ARGV[0] eq "linux"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { print $objs, "\n"; } - } +# } foreach $objs (@linuxobjs) { print $objs, "\n"; @@ -350,50 +2733,55 @@ if ($ARGV[0] eq "linux"){ } if ($ARGV[0] eq "osx"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print "_", $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { print "_", $objs, "\n"; } - } +# } exit(0); } if ($ARGV[0] eq "aix"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { print $objs, "\n"; } - } +# } exit(0); } if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; - foreach $objs (@objs) { - $uppercase = $objs; - $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","_ \@", $count, "\n"; - $count ++; - print "\t",$objs, "_=$objs","_ \@", $count, "\n"; - $count ++; - print 
"\t$uppercase=$objs", "_ \@", $count, "\n"; - $count ++; + + foreach $objs (@underscore_objs) { + unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } } - - if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { - print "\t",$objs,"=$objs"," \@", $count, "\n"; - $count ++; - } + + #for openblas_set_num_threads + print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + + foreach $objs (@no_underscore_objs) { + print "\t",$objs,"=$objs"," \@", $count, "\n"; + $count ++; } exit(0); @@ -424,7 +2812,7 @@ if ($ARGV[0] eq "win2khpl"){ if ($ARGV[0] eq "microsoft"){ print "EXPORTS\n"; $count = 1; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t$objs = $objs","_\n"; @@ -441,7 +2829,7 @@ if ($ARGV[0] eq "microsoft"){ if ($ARGV[0] eq "win2kasm"){ print "\t.text\n"; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t.align 16\n"; @@ -454,14 +2842,14 @@ if ($ARGV[0] eq "win2kasm"){ if ($ARGV[0] eq "linktest"){ print "int main(void){\n"; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "();\n" if $objs ne "xerbla"; } - if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { +# if ($ARGV[4] == 0) { + foreach $objs (@no_underscore_objs) { print $objs, "();\n"; } - } +# } diff --git a/f_check b/f_check index f5bb5a7f6..83587e609 100644 --- a/f_check +++ b/f_check @@ -32,11 +32,12 @@ if ($compiler eq "") { "pgf95", "pgf90", "pgf77", "ifort"); +OUTER: foreach $lists (@lists) { foreach $path (@path) { - if (-f $path . "/" . $lists) { + if (-x $path . "/" . $lists) { $compiler = $lists; - break; + last OUTER; } } } @@ -210,6 +211,10 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For gfortran MIPS + if ($?) { + $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } @@ -218,6 +223,10 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For gfortran MIPS + if ($?) { + $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } @@ -237,6 +246,8 @@ if ($link ne "") { $link =~ s/\-rpath\s+/\-rpath\@/g; @flags = split(/[\s\,\n]/, $link); + # remove leading and trailing quotes from each flag. + @flags = map {s/^['"]|['"]$//g; $_} @flags; foreach $flags (@flags) { if ( diff --git a/getarch.c b/getarch.c index 5b614472a..ac10f1cd5 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -96,12 +96,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_PENRYN */ /* #define FORCE_DUNNINGTON */ /* #define FORCE_NEHALEM */ +/* #define FORCE_SANDYBRIDGE */ +/* #define FORCE_ATOM */ /* #define FORCE_ATHLON */ /* #define FORCE_OPTERON */ /* #define FORCE_OPTERON_SSE3 */ /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ +/* #define FORCE_BOBCAT */ +/* #define FORCE_BULLDOZER */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -116,12 +120,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ -/* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ /* #define FORCE_SPARCV7 */ +/* #define FORCE_GENERIC */ #ifdef FORCE_P2 #define FORCE @@ -137,20 +141,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "P5" #endif -#ifdef FORCE_COPPERMINE -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" -#define SUBARCHITECTURE "PENTIUM3" -#define ARCHCONFIG "-DPENTIUM3 " \ - "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " -#define LIBNAME "coppermine" -#define CORENAME "COPPERMINE" -#endif - #ifdef FORCE_KATMAI #define FORCE #define FORCE_INTEL @@ -165,6 +155,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "KATMAI" #endif +#ifdef FORCE_COPPERMINE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM3" +#define ARCHCONFIG "-DPENTIUM3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "coppermine" +#define CORENAME "COPPERMINE" +#endif + #ifdef FORCE_NORTHWOOD #define FORCE #define FORCE_INTEL @@ -278,6 +282,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "NEHALEM" #endif +#ifdef FORCE_SANDYBRIDGE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -342,13 +360,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DBARCELONA " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ - "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ - "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" #define LIBNAME "barcelona" #define CORENAME "BARCELONA" #endif +#if defined(FORCE_BOBCAT) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BOBCAT" +#define ARCHCONFIG "-DBOBCAT " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" +#define LIBNAME "bobcat" +#define CORENAME "BOBCAT" +#endif + +#if defined (FORCE_BULLDOZER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BULLDOZER" +#define ARCHCONFIG "-DBULLDOZER " \ + "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ + "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ + "-DHAVE_AVX -DHAVE_FMA4" +#define LIBNAME "bulldozer" +#define CORENAME "BULLDOZER" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/getarch_2nd.c b/getarch_2nd.c index 5339af442..4bdd16a99 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,7 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif - printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); } return 0; diff --git a/interface/Makefile b/interface/Makefile index 5cf11cd9b..93892206f 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -318,7 +318,7 @@ CZBLAS3OBJS = \ ifndef NO_CBLAS -CFLAGS += -I. +override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) @@ -400,7 +400,7 @@ all :: libs ifdef FUNCTION_PROFILE $(BLASOBJS) $(BLASOBJS_P) : functable.h -$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) +$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) functable.h : Makefile ./create $(FUNCALLFILES) > functable.h @@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ -$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/kernel/Makefile b/kernel/Makefile index aed145b60..55edcd287 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,7 +6,7 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system ifdef TARGET_CORE -CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) BUILD_KERNEL = 1 KDIR = TSUFFIX = _$(TARGET_CORE) @@ -48,7 +48,7 @@ HPLOBJS = \ COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j diff --git a/kernel/generic/zgemm_ncopy_8_sandy.c b/kernel/generic/zgemm_ncopy_8_sandy.c new file mode 100644 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_8_sandy.c +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j diff --git a/kernel/generic/zgemm_tcopy_4_sandy.c b/kernel/generic/zgemm_tcopy_4_sandy.c new file mode 100644 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_4_sandy.c +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j diff --git a/kernel/generic/zgemm_tcopy_8_sandy.c b/kernel/generic/zgemm_tcopy_8_sandy.c new file mode 100644 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_8_sandy.c +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j> 7); diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -746,6 +746,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SANDYBRIDGE + +#ifdef DEBUG + fprintf(stderr, "Sandybridge\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON #ifdef DEBUG @@ -778,6 +794,38 @@ static void init_parameter(void) { #endif #endif +#ifdef BOBCAT + +#ifdef DEBUG + fprintf(stderr, "Bobcat\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + +#ifdef BULLDOZER + +#ifdef DEBUG + fprintf(stderr, "Bulldozer\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/KERNEL.BOBCAT b/kernel/x86/KERNEL.BOBCAT new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BOBCAT @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BULLDOZER @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = 
ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/KERNEL.SANDYBRIDGE b/kernel/x86/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.SANDYBRIDGE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S index 65792cf45..05d21a7eb 100644 --- a/kernel/x86/amax_sse.S +++ b/kernel/x86/amax_sse.S @@ -495,7 +495,6 @@ ALIGN_4 .L999: - RESTOREREGISTERS subl $8, %esp movss %xmm0, (%esp) diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S index 263aea042..0bdc9185c 100644 --- a/kernel/x86/gemm_kernel_2x4_penryn.S +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -76,6 +76,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S index 18b9a43bd..f081aec2a 100644 --- a/kernel/x86/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -596,7 +596,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -842,7 +842,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1168,7 +1168,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1198,7 +1198,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1347,7 +1347,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1531,7 +1531,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1778,7 +1778,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1793,7 +1793,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1924,7 +1924,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, 
%xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S index 6775d1d18..2d51d9711 100644 --- a/kernel/x86/gemm_kernel_4x4_penryn.S +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -69,6 +69,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif @@ -262,7 +268,7 @@ movaps -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 -#ifndef NEHALEM +#if !(defined(NEHALEM) || defined(SANDYBRIDGE)) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) #endif pshufd $0x93, %xmm1, %xmm2 diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index aae49a22d..3ff9203c8 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) @@ -89,17 +89,22 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) +#define LDAX 12+ARGS(%esp) #define I %eax #define J %ebx @@ -114,6 +119,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -121,7 +127,34 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $21,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y movl STACK_LDA, LDA + movl STACK_X, X movl STACK_INCX, INCX @@ -651,12 +684,22 @@ addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) ALIGN_3 - .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 669c5ac6c..980797d91 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) @@ -76,17 +76,22 @@ 
#endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -101,6 +106,8 @@ PROLOGUE + + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -108,6 +115,33 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -677,10 +711,22 @@ ALIGN_3 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: + popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index a4990116d..326584bbc 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) @@ -89,17 +89,24 @@ #endif #define STACKSIZE 16 +#define ARGS 20 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define NN 4+STACKSIZE(%esp) +#define AA 8+STACKSIZE(%esp) +#define LDAX 12+STACKSIZE(%esp) +#define XX 16+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -114,6 +121,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -122,7 +130,42 @@ PROFCODE movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA movl STACK_X, X + movl X,XX + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) + subl $8, J # Don't use last 8 
float in the buffer. + # Now, split M by block J + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA + movl XX,X + movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -198,6 +241,20 @@ jg .L06 ALIGN_4 +//Padding zero to prevent loading the dirty number from buffer. + movl M, I + movl $8, J + andl $7, I + xorps %xmm0, %xmm0 + subl I, J + ALIGN_2 +.L07: + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl J + jg .L07 + ALIGN_4 + .L10: movl Y, Y1 @@ -628,10 +685,22 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl XX,J + addl %eax,J + movl J,XX + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 9960b5c0c..60d6ef270 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) @@ -76,18 +76,24 @@ #endif #define STACKSIZE 16 +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define AA 4+STACKSIZE(%esp) +#define LDAX 8+STACKSIZE(%esp) +#define NN 12+STACKSIZE(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) - #define I %eax #define J %ebx @@ -101,6 +107,8 @@ PROLOGUE + subl $ARGS,%esp + pushl %ebp pushl %edi pushl %esi @@ -108,7 +116,40 @@ PROFCODE + movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) + subl $4, J # Don't use last 4 double in the buffer. 
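[Editorial sketch] The gemv_n/gemv_t changes above all implement the same fix: instead of packing the whole vector into the fixed 16 MB scratch buffer, M is consumed in chunks that fit it — 2^22 floats, i.e. 2^24 bytes = 16 MB, in the single-precision kernels, and 2^21 doubles in the double-precision ones — with A and the packed operand advanced between chunks and the tail of the last chunk zero-padded. A minimal C sketch of that blocking, under the assumption that the unblocked assembly kernel accumulates partial results; gemv_t_chunk and gemv_t_blocked are hypothetical names, not OpenBLAS symbols:

    #include <stddef.h>

    /* Hypothetical stand-in for the unblocked assembly kernel: it
     * accumulates the partial dot products of an m-row slice of
     * column-major A into y (length n). */
    static void gemv_t_chunk(size_t m, size_t n, float alpha,
                             const float *a, size_t lda,
                             const float *x, float *y)
    {
        for (size_t j = 0; j < n; j++) {
            float s = 0.0f;
            for (size_t i = 0; i < m; i++)
                s += a[i + j * lda] * x[i];
            y[j] += alpha * s;
        }
    }

    void gemv_t_blocked(size_t m, size_t n, float alpha,
                        const float *a, size_t lda,
                        const float *x, float *y)
    {
        /* 2^22 floats * sizeof(float) = 2^24 bytes = 16 MB of buffer;
         * reserve the last 8 floats so padding the tail chunk up to a
         * multiple of 8 elements (the new .L07 loop) stays in bounds. */
        const size_t block = ((size_t)1 << 22) - 8;

        while (m > 0) {
            size_t mb = m < block ? m : block;  /* last chunk may be short */
            gemv_t_chunk(mb, n, alpha, a, lda, x, y);
            a += mb;    /* advance down the rows of A */
            x += mb;    /* transposed gemv: x advances with the rows */
            m -= mb;
        }
    }

This is what the .L0t/.L00t/.L999x scaffolding in the patches does in registers: MMM holds the remaining M, a block of at most 2^22 (or 2^21) elements is peeled off each trip, and AA/XX (or YY) are bumped by M*SIZE before jumping back to .L0t.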
+ # Now, split M by block J + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -117,6 +158,7 @@ leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA + subl $-16 * SIZE, A cmpl $0, N @@ -560,10 +602,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S index aa5ab760e..48edfc585 100644 --- a/kernel/x86/scal_sse.S +++ b/kernel/x86/scal_sse.S @@ -269,7 +269,7 @@ sarl $5, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index dab543470..35b79132c 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -76,7 +76,8 @@ xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO - + jp .L100 # For Alpha = NaN + /* Alpha == ZERO */ cmpl $SIZE, INCX jne .L50 @@ -252,7 +253,7 @@ sarl $4, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 6645b790e..ebd1377f1 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 9a7a466a6..036e17338 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index bb33918ef..6fa7d410e 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ 
b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 147ed19bd..84da443a8 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 55c69e49f..9ce4cd8d4 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) 
#endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index e4f59819b..0bd924cba 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 11cc104e2..a1a35a7a5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 8d6189865..de7c04593 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || 
defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 01876a515..a5333640d 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index 6c2682a10..f5d5ad465 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff 
--git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 40a9604d3..c3619ec3d 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 0d2fcb6d2..5c2dcd0d6 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S index efebe637b..61e1bfc27 100644 --- 
a/kernel/x86/zdot_sse2.S +++ b/kernel/x86/zdot_sse2.S @@ -1541,6 +1541,16 @@ popl %ebx popl %esi popl %edi -/*remove the hidden return value address from the stack.*/ +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) +#ifdef MS_ABI +/* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */ + ret +#else +/* remove the hidden return value address from the stack. For MingW GCC < 4.7 */ ret $0x4 +#endif +#else +/*remove the hidden return value address from the stack on Linux.*/ + ret $0x4 +#endif EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S index 29158df25..623f0beec 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -74,7 +74,7 @@ #define BB %ecx #define LDC %ebp -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define movsd movlps #endif @@ -625,7 +625,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -870,7 +870,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1173,7 +1173,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1203,7 +1203,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1359,7 +1359,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1536,7 +1536,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1794,7 +1794,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1809,7 +1809,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1936,7 +1936,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S index 849361956..70b38dc79 100644 --- a/kernel/x86/zgemm_kernel_1x2_penryn.S +++ 
b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S index edd89b112..715eb4d4f 100644 --- a/kernel/x86/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 340b9d375..0087ac6f4 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 441fbb0c0..f0f2dc0ec 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index 4312ed173..c7ad91235 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 78ca14cab..6c4842893 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index 849d787f6..53abb697b 100644 --- a/kernel/x86/zscal_sse.S +++ 
b/kernel/x86/zscal_sse.S @@ -55,7 +55,7 @@ #define XX %edi #define FLAG %ebp -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -697,7 +697,7 @@ cmpl $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index 5b1da61e6..26ef693a0 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -57,7 +57,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -860,7 +860,7 @@ cmpl $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e5949aa6e..53e53c3ce 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index f77a06d6c..d32451574 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index b01498f78..3c056cdff 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 3668ee2bb..1efa1fd25 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 84d40ddec..9f9449852 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH 
prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 13064166f..849afed73 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index ebff425c0..c1833abe2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index bce0b0252..dd0c5ab21 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BOBCAT b/kernel/x86_64/KERNEL.BOBCAT new file mode 100644 index 000000000..051a52286 --- /dev/null +++ b/kernel/x86_64/KERNEL.BOBCAT @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S 
+STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER new file mode 100644 index 000000000..d59668519 --- /dev/null +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..c321be752 --- /dev/null +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -0,0 +1,84 @@ +SGEMMKERNEL = sgemm_kernel_8x8_sandy.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ 
= +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x8_sandy.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +#DGEMMONCOPY = gemm_ncopy_4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +#DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMKERNEL = cgemm_kernel_4x8_sandy.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 9a7512575..2a9e928ed 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -69,7 +69,7 @@ #endif movaps %xmm0, ALPHA #else - movaps %xmm3, ALPHA + movq 40(%rsp), X movq 48(%rsp), INCX @@ -79,6 +79,10 @@ SAVEREGISTERS +#ifdef WINDOWS_ABI + movaps %xmm3, ALPHA +#endif + shufps $0, ALPHA, ALPHA leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index dea8d0382..45c7b0380 100644 --- 
a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -69,7 +69,6 @@ #endif movaps %xmm0, ALPHA #else - movaps %xmm3, ALPHA movq 40(%rsp), X movq 48(%rsp), INCX @@ -79,6 +78,10 @@ SAVEREGISTERS +#ifdef WINDOWS_ABI + movaps %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S new file mode 100644 index 000000000..5a5588089 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -0,0 +1,4458 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define old_bm %rdi +#define old_bn %rsi +#define old_bk %rdx + +#define bm %r13 +#define bn %r14 +#define bk %r15 + +#define ALPHA %xmm0 +#define ba %rcx +#define bb %r8 +#define C %r9 +#define ldc %r10 + +#define i %r11 +#define k %rax + +#define ptrba %rdi +#define ptrbb %rsi +#define C0 %rbx +#define C1 %rbp + +#define prebb %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define old_ldc 8+STACKSIZE(%rsp) +#define old_offset 16+STACKSIZE(%rsp) + +#define MEMALPHA_R 48(%rsp) +#define MEMALPHA_I 56(%rsp) +#define j 64(%rsp) +#define OFFSET 72(%rsp) +#define kk 80(%rsp) +#define kkk 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define old_ldc 72 + STACKSIZE(%rsp) +#define old_offset 80 + STACKSIZE(%rsp) + +#define MEMALPHA_R 224(%rsp) +#define MEMALPHA_I 232(%rsp) +#define j 240(%rsp) +#define OFFSET 248(%rsp) +#define kk 256(%rsp) +#define kkk 264(%rsp) + +#endif + +#define PREFETCH0 prefetcht0 +#define PREFETCH1 prefetcht0 +#define PREFETCH2 prefetcht0 +#define PRESIZE 64 + +#define xvec0 %xmm0 +#define xvec1 %xmm1 +#define xvec2 %xmm2 +#define xvec3 %xmm3 +#define xvec4 %xmm4 +#define xvec5 %xmm5 +#define xvec6 %xmm6 +#define xvec7 %xmm7 +#define xvec8 %xmm8 +#define xvec9 %xmm9 +#define xvec10 %xmm10 +#define xvec11 %xmm11 +#define xvec12 %xmm12 +#define xvec13 %xmm13 +#define xvec14 %xmm14 +#define xvec15 %xmm15 + +#define yvec0 %ymm0 +#define yvec1 %ymm1 +#define yvec2 %ymm2 +#define yvec3 %ymm3 +#define yvec4 %ymm4 +#define yvec5 %ymm5 +#define yvec6 %ymm6 +#define yvec7 %ymm7 +#define yvec8 %ymm8 +#define yvec9 %ymm9 +#define yvec10 %ymm10 +#define yvec11 %ymm11 +#define yvec12 %ymm12 +#define yvec13 %ymm13 +#define yvec14 %ymm14 +#define yvec15 %ymm15 + +#define LEAQ leaq +#define ADDQ addq +#define MULQ imulq +#define SARQ sarq +#define SALQ salq +#define ANDQ andq +#define SUBQ subq +#define DECQ decq +#define JG jg +#define JLE jle +#define TEST testq +#define OR orq +#define JNE jne +#define JMP jmp +#define NOP +#define XOR xorpd +#undef MOVQ +#define MOVQ movq + +#define XOR_SY vxorps +#define XOR_SX vxorps + +#define LD_SY vmovaps +#define LD_SX vmovaps +#define LDL_SX vmovlps +#define LDL_SY vmovlps +#define LDH_SX vmovhps +#define LDH_SY vmovhps + +#define ST_SY vmovaps +#define ST_SX vmovaps +#define STL_SX vmovlps +#define STL_SY vmovlps +#define STH_SX vmovhps +#define STH_SY vmovhps + +#define EDUP_SY vmovsldup +#define ODUP_SY vmovshdup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup + +#define ADD_SY vaddps +#define ADD_SX vaddps +#define SUB_SY vsubps +#define SUB_SX vsubps + +#define ADDSUB_SY vaddsubps +#define ADDSUB_SX vaddsubps + +#define MUL_SY vmulps +#define MUL_SX vmulps + +#define SHUF_SY vperm2f128 +#define SHUF_SX vpshufd + +#define VPERMILP_SY vpermilps +#define VPERMILP_SX vpermilps + +#define BROAD_SY vbroadcastss +#define BROAD_SX vbroadcastss + +#define MOV_SY vmovaps +#define MOV_SX vmovaps + +#define REVS_SY vshufps +#define REVS_SX vshufps + +#define EXTRA_SY vextractf128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1_SY ADD_SY +#define ADD2_SY ADDSUB_SY +#define ADD1_SX ADD_SX +#define ADD2_SX ADDSUB_SX +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1_SY SUB_SY +#define ADD2_SY ADDSUB_SY 
+#define ADD1_SX SUB_SX +#define ADD2_SX ADDSUB_SX +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1_SY SUB_SY +#define ADD2_SY ADDSUB_SY +#define ADD1_SX SUB_SX +#define ADD2_SX ADDSUB_SX +#else +#define ADD1_SY ADD_SY +#define ADD2_SY ADDSUB_SY +#define ADD1_SX ADD_SX +#define ADD2_SX ADDSUB_SX +#endif + +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, old_bm + movq ARG2, old_bn + movq ARG3, old_bk + movq OLD_A, ba + movq OLD_B, bb + movq OLD_C, C + movq old_ldc, ldc +#ifdef TRMMKERNEL + movq old_offset, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else +movq old_ldc, ldc +#ifdef TRMMKERNEL +movq old_offset, %r11; +#endif +#endif + +vzeroupper + +vmovlps %xmm0, MEMALPHA_R +vmovlps %xmm1, MEMALPHA_I +movq old_bm, bm +movq old_bn, bn +movq old_bk, bk +salq $ZBASE_SHIFT, ldc +#ifdef TRMMKERNEL +movq %r11, OFFSET +#ifndef LEFT +negq %r11; +#endif +movq %r11, kk; +#endif + +MOVQ bn,j; +SARQ $2,j; # Rn = 4 +JLE .L0_loopE; +ALIGN_5; +.L0_bodyB:; +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C,C0; +LEAQ (C,ldc,2),C1; +MOVQ bk, k; +SALQ $5, k; +LEAQ (bb, k, 1), prebb; # Rn=4, SIZE=4 COMPLEX=2 +MOVQ ba,ptrba; +MOVQ bm,i; +SARQ $3,i; # Rm = 8 +JLE .L1_loopE; +ALIGN_5; +.L1_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +# Initial results register +PREFETCH0 0*SIZE(prebb); +XOR_SY yvec15, yvec15, yvec15; +PREFETCH0 16*SIZE(prebb); +ADDQ $32*SIZE, prebb; +XOR_SY yvec14, yvec14, yvec14; +PREFETCH2 3*SIZE(C0); +XOR_SY yvec13, yvec13, yvec13; +PREFETCH2 3*SIZE(C0, ldc, 1); +XOR_SY yvec12, yvec12, yvec12; +PREFETCH2 3*SIZE(C1); +EDUP_SY 0*SIZE(ptrbb), yvec2; # Br0, Br1, Br2, Br3 +PREFETCH2 3*SIZE(C1, ldc, 1); +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +LD_SY 0*SIZE(ptrba), yvec0; # Ar0, Ai0, Ar1, Ai1.. +XOR_SY yvec9, yvec9, yvec9; +XOR_SY yvec8, yvec8, yvec8; +VPERMILP_SY $0x4e, yvec2, yvec3; # Br2, Br3, Br0, Br1 +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2,k; # Unroll 4 times +JLE .L2_loopE; +ALIGN_5; +.L2_bodyB:; +# Computing kernel + +######### Unroll 1 ################## +PREFETCH0 PRESIZE*SIZE(ptrba); +LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. 
+MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec1, yvec5, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +EDUP_SY 8*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +LD_SY 16*SIZE(ptrba), yvec0; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; + +######### Unroll 2 ################## +PREFETCH0 (PRESIZE+16)*SIZE(ptrba); +LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec1, yvec5, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +EDUP_SY 16*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +LD_SY 32*SIZE(ptrba), yvec0; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; + +######### Unroll 3 ################## +PREFETCH0 (PRESIZE+32)*SIZE(ptrba); +LD_SY 40*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. 
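+# Unrolls 2-4 repeat the same update at advanced offsets; each unroll
+# already issues the LD_SY/EDUP_SY for the next one between its own
+# multiplies, so load latency overlaps the arithmetic.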
+MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +ODUP_SY 16*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec1, yvec5, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +EDUP_SY 24*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +LD_SY 48*SIZE(ptrba), yvec0; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; + +######### Unroll 4 ################## +PREFETCH0 (PRESIZE+48)*SIZE(ptrba); +LD_SY 56*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADDQ $64*SIZE, ptrba; +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +ODUP_SY 24*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADDQ $32*SIZE, ptrbb; +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +MUL_SY yvec1, yvec5, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +EDUP_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +LD_SY 0*SIZE(ptrba), yvec0; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; +.L2_bodyE:; +DECQ k; +JG .L2_bodyB; +ALIGN_5 +.L2_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L3_loopE; +ALIGN_5 +.L3_loopB: +######### Unroll 1 ################## +PREFETCH0 PRESIZE*SIZE(ptrba) +LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. 
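+# .L3 drains two leftover k iterations (bk & 2), and .L4 below drains
+# the final one (bk & 1), with the same pattern as the unrolled body.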
+MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +EDUP_SY 8*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +LD_SY 16*SIZE(ptrba), yvec0; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; + +######### Unroll 2 ################## +PREFETCH0 (PRESIZE+16)*SIZE(ptrba) +LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADDQ $32*SIZE, ptrba +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADDQ $16*SIZE, ptrbb; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD2_SY yvec6, yvec14, yvec14; +ADD2_SY yvec7, yvec12, yvec12; + +EDUP_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +LD_SY 0*SIZE(ptrba), yvec0; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; +.L3_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L4_loopE; +ALIGN_5 +.L4_loopB:; +######### Unroll 1 ################## +PREFETCH0 PRESIZE*SIZE(ptrba) +LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. 
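+# After this last iteration the accumulators hold raw products; the
+# "Handle" block below applies the per-variant conjugation sign
+# fix-ups before alpha is multiplied in.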
+MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +ADDQ $16*SIZE, ptrba; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD1_SY yvec6, yvec15, yvec15; +ADD1_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +ADD1_SY yvec6, yvec14, yvec14; +ADD1_SY yvec7, yvec12, yvec12; + +ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADDQ $8*SIZE, ptrbb; +VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 +ADD1_SY yvec6, yvec11, yvec11; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. +ADD1_SY yvec6, yvec10, yvec10; +ADD1_SY yvec7, yvec8, yvec8; + +VPERMILP_SY $0xb1, yvec1, yvec1; +MUL_SY yvec0, yvec2, yvec6; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 +ADD2_SY yvec6, yvec15, yvec15; +ADD2_SY yvec7, yvec13, yvec13; + +MUL_SY yvec1, yvec2, yvec6; +ADD2_SY yvec6, yvec14, yvec14; +SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 +MUL_SY yvec1, yvec3, yvec7; +ADD2_SY yvec7, yvec12, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +VPERMILP_SY $0x4e, yvec2, yvec3; +ADD2_SY yvec6, yvec11, yvec11; +ADD2_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec6, yvec10, yvec10; +ADD2_SY yvec7, yvec8, yvec8; + +.L4_loopE:; +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY yvec14, yvec7, yvec14; +ADDSUB_SY yvec13, yvec7, yvec13; +ADDSUB_SY yvec12, yvec7, yvec12; +ADDSUB_SY yvec11, yvec7, yvec11; +ADDSUB_SY yvec10, yvec7, yvec10; +ADDSUB_SY yvec9, yvec7, yvec9; +ADDSUB_SY yvec8, yvec7, yvec8; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_SY yvec15, yvec7, yvec15; +SUB_SY yvec14, yvec7, yvec14; +SUB_SY yvec13, yvec7, yvec13; +SUB_SY yvec12, yvec7, yvec12; +SUB_SY yvec11, yvec7, yvec11; +SUB_SY yvec10, yvec7, yvec10; +SUB_SY yvec9, yvec7, yvec9; +SUB_SY yvec8, yvec7, yvec8; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec14, yvec14; +VPERMILP_SY $0xb1, yvec13, yvec13; +VPERMILP_SY $0xb1, yvec12, yvec12; +VPERMILP_SY $0xb1, yvec11, yvec11; +VPERMILP_SY $0xb1, yvec10, yvec10; +VPERMILP_SY $0xb1, yvec9, yvec9; +VPERMILP_SY $0xb1, yvec8, yvec8; +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY yvec14, yvec7, yvec14; +ADDSUB_SY yvec13, yvec7, yvec13; +ADDSUB_SY yvec12, yvec7, yvec12; +ADDSUB_SY yvec11, yvec7, yvec11; +ADDSUB_SY yvec10, yvec7, yvec10; +ADDSUB_SY yvec9, yvec7, yvec9; +ADDSUB_SY yvec8, yvec7, yvec8; +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec14, yvec14; +VPERMILP_SY $0xb1, yvec13, yvec13; +VPERMILP_SY $0xb1, yvec12, yvec12; +VPERMILP_SY $0xb1, yvec11, yvec11; +VPERMILP_SY $0xb1, yvec10, yvec10; +VPERMILP_SY $0xb1, yvec9, yvec9; +VPERMILP_SY $0xb1, yvec8, yvec8; +#endif +##### Load Alpha #### +BROAD_SY MEMALPHA_R,yvec7; +BROAD_SY MEMALPHA_I,yvec6; +##### Multiply Alpha #### +VPERMILP_SY $0xb1,yvec15, yvec5; +MUL_SY yvec15, yvec7, yvec15; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec15, yvec15; +VPERMILP_SY $0xb1,yvec14, yvec4; +MUL_SY yvec14, yvec7, yvec14; +MUL_SY yvec4, yvec6, yvec4; +ADDSUB_SY yvec4, yvec14, yvec14; +VPERMILP_SY $0xb1,yvec13, yvec3; +MUL_SY yvec13, yvec7, yvec13; +MUL_SY yvec3, yvec6, yvec3; 
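+# (the same scaling pattern repeats for every accumulator: a swapped
+# copy times alpha_i, the accumulator times alpha_r, and vaddsubps
+# recombines them into the scaled real/imaginary parts)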
+ADDSUB_SY yvec3, yvec13, yvec13; +VPERMILP_SY $0xb1,yvec12, yvec2; +MUL_SY yvec12, yvec7, yvec12; +MUL_SY yvec2, yvec6, yvec2; +ADDSUB_SY yvec2, yvec12, yvec12; +VPERMILP_SY $0xb1,yvec11, yvec1; +MUL_SY yvec11, yvec7, yvec11; +MUL_SY yvec1, yvec6, yvec1; +ADDSUB_SY yvec1, yvec11, yvec11; +VPERMILP_SY $0xb1,yvec10, yvec0; +MUL_SY yvec10, yvec7, yvec10; +MUL_SY yvec0, yvec6, yvec0; +ADDSUB_SY yvec0, yvec10, yvec10; +VPERMILP_SY $0xb1,yvec9, yvec5; +MUL_SY yvec9, yvec7, yvec9; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec9, yvec9; +VPERMILP_SY $0xb1,yvec8, yvec4; +MUL_SY yvec8, yvec7, yvec8; +MUL_SY yvec4, yvec6, yvec4; +ADDSUB_SY yvec4, yvec8, yvec8; +#### Shuffle Results #### +MOV_SY yvec15,yvec7; +REVS_SY $0xe4,yvec13,yvec15,yvec15; +REVS_SY $0xe4,yvec7,yvec13,yvec13; +MOV_SY yvec14,yvec7; +REVS_SY $0xe4,yvec12,yvec14,yvec14; +REVS_SY $0xe4,yvec7,yvec12,yvec12; +MOV_SY yvec11,yvec7; +REVS_SY $0xe4,yvec9,yvec11,yvec11; +REVS_SY $0xe4,yvec7,yvec9,yvec9; +MOV_SY yvec10,yvec7; +REVS_SY $0xe4,yvec8,yvec10,yvec10; +REVS_SY $0xe4,yvec7,yvec8,yvec8; +#### Store Back #### +#### Testing alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L4_loopEx; +ALIGN_5 +EXTRA_SY $1,yvec15,xvec7; +EXTRA_SY $1,yvec14,xvec6; +EXTRA_SY $1,yvec13,xvec5; +EXTRA_SY $1,yvec12,xvec4; +EXTRA_SY $1,yvec11,xvec3; +EXTRA_SY $1,yvec10,xvec2; +EXTRA_SY $1,yvec9,xvec1; +EXTRA_SY $1,yvec8,xvec0; +#ifndef TRMMKERNEL +ADD_SY 0*SIZE(C0),xvec15, xvec15; +ADD_SY 4*SIZE(C1),xvec7, xvec7; +ADD_SY 8*SIZE(C0),xvec14, xvec14; +ADD_SY 12*SIZE(C1),xvec6, xvec6; +ADD_SY 0*SIZE(C0,ldc,1),xvec13, xvec13; +ADD_SY 4*SIZE(C1,ldc,1),xvec5, xvec5; +ADD_SY 8*SIZE(C0,ldc,1),xvec12, xvec12; +ADD_SY 12*SIZE(C1,ldc,1),xvec4, xvec4; +ADD_SY 0*SIZE(C1),xvec11, xvec11; +ADD_SY 4*SIZE(C0),xvec3, xvec3; +ADD_SY 8*SIZE(C1),xvec10, xvec10; +ADD_SY 12*SIZE(C0),xvec2, xvec2; +ADD_SY 0*SIZE(C1,ldc,1),xvec9, xvec9; +ADD_SY 4*SIZE(C0,ldc,1),xvec1, xvec1; +ADD_SY 8*SIZE(C1,ldc,1),xvec8, xvec8; +ADD_SY 12*SIZE(C0,ldc,1),xvec0, xvec0; +#endif +ST_SY xvec15,0*SIZE(C0); +ST_SY xvec7,4*SIZE(C1); +ST_SY xvec14,8*SIZE(C0); +ST_SY xvec6,12*SIZE(C1); +ST_SY xvec13,0*SIZE(C0,ldc,1); +ST_SY xvec5,4*SIZE(C1,ldc,1); +ST_SY xvec12,8*SIZE(C0,ldc,1); +ST_SY xvec4,12*SIZE(C1,ldc,1); +ST_SY xvec11,0*SIZE(C1); +ST_SY xvec3,4*SIZE(C0); +ST_SY xvec10,8*SIZE(C1); +ST_SY xvec2,12*SIZE(C0); +ST_SY xvec9,0*SIZE(C1,ldc,1); +ST_SY xvec1,4*SIZE(C0,ldc,1); +ST_SY xvec8,8*SIZE(C1,ldc,1); +ST_SY xvec0,12*SIZE(C0,ldc,1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $16*SIZE,C0; +ADDQ $16*SIZE,C1; +.L1_bodyE:; +DECQ i; +JG .L1_bodyB; +JMP .L1_loopE; +ALIGN_5 +.L4_loopEx: +EXTRA_SY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0), xvec6, xvec6; +LDH_SY 2*SIZE(C0), xvec6, xvec6; +ADD_SY xvec6, xvec15, xvec15; +#endif +STL_SY xvec15, 0*SIZE(C0); +STH_SY xvec15, 2*SIZE(C0); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1), xvec5, xvec5; +LDH_SY 6*SIZE(C1), xvec5, xvec5; +ADD_SY xvec5, xvec7, xvec7; +#endif +STL_SY xvec7, 4*SIZE(C1); +STH_SY xvec7, 6*SIZE(C1); + +EXTRA_SY $1, yvec14, xvec6; +#ifndef TRMMKERNEL +LDL_SY 8*SIZE(C0), xvec5, xvec5; +LDH_SY 10*SIZE(C0), xvec5, xvec5; +ADD_SY xvec5, xvec14, xvec14; +#endif +STL_SY xvec14, 8*SIZE(C0); +STH_SY xvec14, 10*SIZE(C0); +#ifndef TRMMKERNEL +LDL_SY 
12*SIZE(C1), xvec4, xvec4; +LDH_SY 14*SIZE(C1), xvec4, xvec4; +ADD_SY xvec4, xvec6, xvec6; +#endif +STL_SY xvec6, 12*SIZE(C1); +STH_SY xvec6, 14*SIZE(C1); + +EXTRA_SY $1, yvec13, xvec5; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0, ldc, 1), xvec4, xvec4; +LDH_SY 2*SIZE(C0, ldc, 1), xvec4, xvec4; +ADD_SY xvec4, xvec13, xvec13; +#endif +STL_SY xvec13, 0*SIZE(C0, ldc, 1); +STH_SY xvec13, 2*SIZE(C0, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SY 6*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_SY xvec3, xvec5, xvec5; +#endif +STL_SY xvec5, 4*SIZE(C1, ldc, 1); +STH_SX xvec5, 6*SIZE(C1, ldc, 1); + +EXTRA_SY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +LDL_SY 8*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_SY 10*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_SY xvec3, xvec12, xvec12; +#endif +STL_SY xvec12, 8*SIZE(C0, ldc, 1); +STH_SY xvec12, 10*SIZE(C0, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 12*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SY 14*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_SY xvec2, xvec4, xvec4; +#endif +STL_SY xvec4, 12*SIZE(C1, ldc, 1); +STH_SY xvec4, 14*SIZE(C1, ldc, 1); + +EXTRA_SY $1, yvec11, xvec3; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1), xvec2, xvec2; +LDH_SY 2*SIZE(C1), xvec2, xvec2; +ADD_SY xvec2, xvec11, xvec11; +#endif +STL_SY xvec11, 0*SIZE(C1); +STH_SY xvec11, 2*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0), xvec1, xvec1; +LDH_SY 6*SIZE(C0), xvec1, xvec1; +ADD_SY xvec1, xvec3, xvec3; +#endif +STL_SY xvec3, 4*SIZE(C0); +STH_SY xvec3, 6*SIZE(C0); + +EXTRA_SY $1, yvec10, xvec2; +#ifndef TRMMKERNEL +LDL_SY 8*SIZE(C1), xvec1, xvec1; +LDH_SY 10*SIZE(C1), xvec1, xvec1; +ADD_SY xvec1, xvec10, xvec10; +#endif +STL_SY xvec10, 8*SIZE(C1); +STH_SY xvec10, 10*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SY 12*SIZE(C0), xvec0, xvec0; +LDH_SY 14*SIZE(C0), xvec0, xvec0; +ADD_SY xvec0, xvec2, xvec2; +#endif +STL_SY xvec2, 12*SIZE(C0); +STH_SY xvec2, 14*SIZE(C0); + +EXTRA_SY $1, yvec9, xvec1; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1, ldc, 1), xvec7, xvec7; +LDH_SY 2*SIZE(C1, ldc, 1), xvec7, xvec7; +ADD_SY xvec7, xvec9, xvec9; +#endif +STL_SY xvec9, 0*SIZE(C1, ldc, 1); +STH_SY xvec9, 2*SIZE(C1, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SY 6*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_SY xvec6, xvec1, xvec1; +#endif +STL_SY xvec1, 4*SIZE(C0, ldc, 1); +STH_SY xvec1, 6*SIZE(C0, ldc, 1); + +EXTRA_SY $1, yvec8, xvec0; +#ifndef TRMMKERNEL +LDL_SY 8*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_SY 10*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_SY xvec6, xvec8, xvec8; +#endif +STL_SY xvec8, 8*SIZE(C1, ldc, 1); +STH_SY xvec8, 10*SIZE(C1, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 12*SIZE(C0, ldc, 1), xvec5, xvec5; +LDH_SY 14*SIZE(C0, ldc, 1), xvec5, xvec5; +ADD_SY xvec5, xvec0, xvec0; +#endif +STL_SY xvec0, 12*SIZE(C0, ldc, 1); +STH_SY xvec0, 14*SIZE(C0, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $16*SIZE, C0; +ADDQ $16*SIZE, C1; +DECQ i; +JG .L1_bodyB; +ALIGN_5; +.L1_loopE:; +TEST $4, bm; +JLE .L5_loopE; +ALIGN_5 +.L5_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; 
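+# m & 4 tail: one 4x4 block of complex C, accumulated in
+# yvec15/yvec13/yvec11/yvec9.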
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec9, yvec9, yvec9;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L8_loopE;
+ALIGN_5
+.L8_bodyB:
+#### Unroll times 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 2 ####
+LD_SY 8*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 8*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 3 ####
+LD_SY 16*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 16*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 16*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 4 ####
+LD_SY 24*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 24*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 24*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2,
yvec6; +ADD2_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec1, yvec3, yvec7; +ADD2_SY yvec7, yvec13, yvec13; + +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec1, yvec4, yvec6; +ADD2_SY yvec6, yvec11, yvec11; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec7, yvec9, yvec9; +ADDQ $32*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L8_bodyB; +ALIGN_5 +.L8_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L9_loopE; +ALIGN_5 +.L9_bodyB: +#### Unroll times 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +VPERMILP_SY $0xb1, yvec0, yvec1; +EDUP_SY 0*SIZE(ptrbb), yvec2; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec0, yvec3, yvec7; +ADD1_SY yvec7, yvec13, yvec13; + +ODUP_SY 0*SIZE(ptrbb), yvec2; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec0, yvec4, yvec6; +ADD1_SY yvec6, yvec11, yvec11; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec5, yvec7; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec1, yvec3, yvec7; +ADD2_SY yvec7, yvec13, yvec13; + +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec1, yvec4, yvec6; +ADD2_SY yvec6, yvec11, yvec11; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec7, yvec9, yvec9; + +#### Unroll time 2 #### +LD_SY 8*SIZE(ptrba), yvec0; +VPERMILP_SY $0xb1, yvec0, yvec1; +EDUP_SY 8*SIZE(ptrbb), yvec2; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec0, yvec3, yvec7; +ADD1_SY yvec7, yvec13, yvec13; + +ODUP_SY 8*SIZE(ptrbb), yvec2; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec0, yvec4, yvec6; +ADD1_SY yvec6, yvec11, yvec11; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec5, yvec7; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec1, yvec3, yvec7; +ADD2_SY yvec7, yvec13, yvec13; + +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec1, yvec4, yvec6; +ADD2_SY yvec6, yvec11, yvec11; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec7, yvec9, yvec9; +ADDQ $16*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L9_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L10_loopE; +ALIGN_5 +.L10_bodyB: +#### Unroll times 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +VPERMILP_SY $0xb1, yvec0, yvec1; +EDUP_SY 0*SIZE(ptrbb), yvec2; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec0, yvec3, yvec7; +ADD1_SY yvec7, yvec13, yvec13; + +ODUP_SY 0*SIZE(ptrbb), yvec2; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec0, yvec4, yvec6; +ADD1_SY yvec6, yvec11, yvec11; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec5, yvec7; +ADD1_SY yvec7, yvec9, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec1, yvec3, yvec7; +ADD2_SY yvec7, yvec13, yvec13; + +SHUF_SY $0x03, yvec3, yvec3, yvec5; +MUL_SY yvec1, yvec4, yvec6; +ADD2_SY yvec6, yvec11, yvec11; +MUL_SY yvec1, yvec5, yvec7; +ADD2_SY yvec7, yvec9, yvec9; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L10_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY yvec13, yvec7, yvec13; +ADDSUB_SY yvec11, yvec7, yvec11; +ADDSUB_SY yvec9, yvec7, yvec9; +#elif defined(NR) || 
defined(NC) || defined(TR) || defined(TC) +SUB_SY yvec15, yvec7, yvec15; +SUB_SY yvec13, yvec7, yvec13; +SUB_SY yvec11, yvec7, yvec11; +SUB_SY yvec9, yvec7, yvec9; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec13, yvec13; +VPERMILP_SY $0xb1, yvec11, yvec11; +VPERMILP_SY $0xb1, yvec9, yvec9; +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY yvec13, yvec7, yvec13; +ADDSUB_SY yvec11, yvec7, yvec11; +ADDSUB_SY yvec9, yvec7, yvec9; +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec13, yvec13; +VPERMILP_SY $0xb1, yvec11, yvec11; +VPERMILP_SY $0xb1, yvec9, yvec9; +#endif +##### Load Alpha #### +BROAD_SY MEMALPHA_R,yvec7; +BROAD_SY MEMALPHA_I,yvec6; +##### Multiply Alpha #### +VPERMILP_SY $0xb1,yvec15, yvec5; +MUL_SY yvec15, yvec7, yvec15; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec15, yvec15; +VPERMILP_SY $0xb1,yvec13, yvec3; +MUL_SY yvec13, yvec7, yvec13; +MUL_SY yvec3, yvec6, yvec3; +ADDSUB_SY yvec3, yvec13, yvec13; +VPERMILP_SY $0xb1,yvec11, yvec1; +MUL_SY yvec11, yvec7, yvec11; +MUL_SY yvec1, yvec6, yvec1; +ADDSUB_SY yvec1, yvec11, yvec11; +VPERMILP_SY $0xb1,yvec9, yvec5; +MUL_SY yvec9, yvec7, yvec9; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec9, yvec9; +#### Writing back #### +#### Shuffle Results #### +MOV_SY yvec15,yvec7; +REVS_SY $0xe4,yvec13,yvec15,yvec15; +REVS_SY $0xe4,yvec7,yvec13,yvec13; +MOV_SY yvec11,yvec7; +REVS_SY $0xe4,yvec9,yvec11,yvec11; +REVS_SY $0xe4,yvec7,yvec9,yvec9; +#### Writing back #### +EXTRA_SY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec6, xvec6; +LDH_SX 2*SIZE(C0), xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +#ifndef TRMMKERNEL +LDL_SX 4*SIZE(C1), xvec4, xvec4; +LDH_SX 6*SIZE(C1), xvec4, xvec4; +ADD_SX xvec4, xvec7, xvec7; +#endif +STL_SX xvec7, 4*SIZE(C1); +STH_SX xvec7, 6*SIZE(C1); + +EXTRA_SY $1, yvec13, xvec5; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +#endif +STL_SX xvec13, 0*SIZE(C0, ldc, 1); +STH_SX xvec13, 2*SIZE(C0, ldc, 1); +#ifndef TRMMKERNEL +LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_SX xvec2, xvec5, xvec5; +#endif +STL_SX xvec5, 4*SIZE(C1, ldc, 1); +STH_SX xvec5, 6*SIZE(C1, ldc, 1); + +EXTRA_SY $1, yvec11, xvec3; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1), xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +#endif +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SX 4*SIZE(C0), xvec0, xvec0; +LDH_SX 6*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec3, xvec3; +#endif +STL_SX xvec3, 4*SIZE(C0); +STH_SX xvec3, 6*SIZE(C0); + +EXTRA_SY $1, yvec9, xvec1; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; +ADD_SX xvec0, xvec9, xvec9; +#endif +STL_SX xvec9, 0*SIZE(C1, ldc, 1); +STH_SX xvec9, 2*SIZE(C1, ldc, 1); +#ifndef TRMMKERNEL +LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_SX xvec6, xvec1, xvec1; +#endif +STL_SX xvec1, 4*SIZE(C0, ldc, 1); +STH_SX xvec1, 6*SIZE(C0, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif + +ADDQ 
$8*SIZE, C0; +ADDQ $8*SIZE, C1; + +.L5_loopE: +TEST $2, bm; +JLE .L6_loopE; +ALIGN_5 +.L6_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial Results Register #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L11_loopE; +ALIGN_5 +.L11_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 8*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; + +LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 20*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 16*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 20*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; + +LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, 
br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 28*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 24*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 28*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; +ADDQ $16*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L11_bodyB; +ALIGN_5 +.L11_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L12_loopE; +ALIGN_5 +.L12_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 8*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; +ADDQ $8*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L12_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L13_loopE; +ALIGN_5 +.L13_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 +EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 +SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xb1, xvec0, xvec1; +ODUP_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x4e, xvec2, xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; + +ODUP_SX 4*SIZE(ptrbb), 
xvec4; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L13_loopE: +#### Handle #### +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec13, xvec13; +SHUF_SX $0xb1, xvec12, xvec12; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec13, xvec13; +SHUF_SX $0xb1, xvec12, xvec12; +#endif +##### Load Alpha #### +BROAD_SX MEMALPHA_R,xvec7; +BROAD_SX MEMALPHA_I,xvec6; +##### Multiply Alpha #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +VPERMILP_SX $0xb1,xvec14, xvec4; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; +VPERMILP_SX $0xb1,xvec13, xvec3; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; +VPERMILP_SX $0xb1,xvec12, xvec2; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0, ldc, 1); +STL_SX xvec14, 0*SIZE(C0, ldc, 1); +STH_SX xvec14, 2*SIZE(C0); +STL_SX xvec13, 0*SIZE(C1); +STH_SX xvec13, 2*SIZE(C1, ldc, 1); +STL_SX xvec12, 0*SIZE(C1, ldc, 1); +STH_SX xvec12, 2*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif + +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; + +.L6_loopE: +TEST $1, bm; +JLE .L7_loopE; +ALIGN_5 +.L7_bodyB: +#if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L14_loopE; +ALIGN_5 +.L14_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; + +BROAD_SX 2*SIZE(ptrba), xvec0; +LD_SX 8*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 3*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; + +BROAD_SX 4*SIZE(ptrba), xvec0; +LD_SX 16*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 20*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 5*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; + +BROAD_SX 6*SIZE(ptrba), xvec0; +LD_SX 24*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 28*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 7*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; +ADDQ $8*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L14_bodyB; +ALIGN_5 +.L14_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L15_loopE; +ALIGN_5 +.L15_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; + +BROAD_SX 2*SIZE(ptrba), xvec0; +LD_SX 8*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 12*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 3*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; +ADDQ $4*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L15_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, 
kkk; +#endif +JLE .L16_loopE; +ALIGN_5 +.L16_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +LD_SX 4*SIZE(ptrbb), xvec4; +SHUF_SX $0xb1, xvec4, xvec5; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; +ADDQ $2*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L16_loopE: +#### Handle #### +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +#endif +##### Load Alpha #### +BROAD_SX MEMALPHA_R,xvec7; +BROAD_SX MEMALPHA_I,xvec6; +##### Multiply Alpha #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +VPERMILP_SX $0xb1,xvec14, xvec4; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 0*SIZE(C0, ldc, 1); +STL_SX xvec14, 0*SIZE(C1); +STH_SX xvec14, 0*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif + +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L7_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $4, kk; +#endif +MOVQ bk,k; +SALQ $5,k; +ADDQ k,bb; +LEAQ (C,ldc,4),C; +.L0_bodyE:; +DECQ j; +JG .L0_bodyB; +ALIGN_5; +.L0_loopE:; +TEST $2, bn; +JLE .L20_loopE; +ALIGN_5 +.L20_bodyB: +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C, C0; +LEAQ (C, ldc, 1), C1; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L21_loopE; +ALIGN_5 +.L21_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +XOR_SY yvec9, yvec9, yvec9; +XOR_SY yvec8, yvec8, yvec8; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif 
(defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L211_loopE; +ALIGN_5 +.L211_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 16*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 20*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 24*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 28*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; + +EDUP_SX 8*SIZE(ptrbb), xvec4; +ODUP_SX 8*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 32*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, 
xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 36*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 40*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 44*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; + +EDUP_SX 12*SIZE(ptrbb), xvec4; +ODUP_SX 12*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 48*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 52*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 56*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 60*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; +ADDQ $64*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L211_bodyB; +ALIGN_5 +.L211_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L212_loopE; +ALIGN_5 +.L212_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX 
xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 16*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 20*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 24*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 28*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; +ADDQ $32*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L212_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L213_loopE; +ALIGN_5 +.L213_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; 
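+# (third and fourth row pairs of the 8x2 tile; the k & 1 tail reuses
+# the same EDUP/ODUP expand-and-multiply pattern)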
+MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L213_loopE: +#### Handle #### +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec9, xvec7, xvec7; +MOV_SX xvec7, xvec9; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec8, xvec7, xvec7; +MOV_SX xvec7, xvec8; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec9, xvec7, xvec7; +MOV_SX xvec7, xvec9; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec8, xvec7, xvec7; +MOV_SX xvec7, xvec8; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec13, xvec13; +SHUF_SX $0xb1, xvec12, xvec12; +SHUF_SX $0xb1, xvec11, xvec11; +SHUF_SX $0xb1, xvec10, xvec10; +SHUF_SX $0xb1, xvec9, xvec9; +SHUF_SX $0xb1, xvec8, xvec8; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec13, xvec7, xvec7; +MOV_SX xvec7, xvec13; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec12, xvec7, xvec7; +MOV_SX xvec7, xvec12; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec9, xvec7, xvec7; +MOV_SX xvec7, xvec9; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec8, xvec7, xvec7; +MOV_SX xvec7, xvec8; +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec13, xvec13; +SHUF_SX $0xb1, xvec12, xvec12; +SHUF_SX $0xb1, xvec11, xvec11; +SHUF_SX $0xb1, xvec10, xvec10; +SHUF_SX $0xb1, xvec9, xvec9; +SHUF_SX $0xb1, xvec8, xvec8; +#endif +#### Multiply Alpha #### +BROAD_SX MEMALPHA_R, xvec7; +BROAD_SX MEMALPHA_I, xvec6; +#### Writing back #### +VPERMILP_SX
$0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +VPERMILP_SX $0xb1,xvec14, xvec4; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; +VPERMILP_SX $0xb1,xvec13, xvec3; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; +VPERMILP_SX $0xb1,xvec12, xvec2; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; +VPERMILP_SX $0xb1,xvec11, xvec1; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; +VPERMILP_SX $0xb1,xvec10, xvec0; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; +VPERMILP_SX $0xb1,xvec9, xvec5; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec9, xvec9; +VPERMILP_SX $0xb1,xvec8, xvec4; +MUL_SX xvec7, xvec8, xvec8; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec8, xvec8; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C1), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 4*SIZE(C0); +STH_SX xvec14, 6*SIZE(C1); +STL_SX xvec13, 8*SIZE(C0); +STH_SX xvec13, 10*SIZE(C1); +STL_SX xvec12, 12*SIZE(C0); +STH_SX xvec12, 14*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 8*SIZE(C1), xvec6, xvec6; +LDH_SX 10*SIZE(C0), xvec6, xvec6; +LDL_SX 12*SIZE(C1), xvec7, xvec7; +LDH_SX 14*SIZE(C0), xvec7, xvec7; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; +#endif +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C0); +STL_SX xvec10, 4*SIZE(C1); +STH_SX xvec10, 6*SIZE(C0); +STL_SX xvec9, 8*SIZE(C1); +STH_SX xvec9, 10*SIZE(C0); +STL_SX xvec8, 12*SIZE(C1); +STH_SX xvec8, 14*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk; +#endif + +ADDQ $16*SIZE, C0; +ADDQ $16*SIZE, C1; +DECQ i; +JG .L21_bodyB; +ALIGN_5 +.L21_loopE: +TEST $4, bm; +JLE .L22_loopE; +ALIGN_5 +.L22_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif + +SARQ $2, k; +JLE .L221_loopE; +ALIGN_5 +.L221_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), 
xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +#### Unroll 2 ##### +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +#### Unroll 3 #### +EDUP_SX 8*SIZE(ptrbb), xvec4; +ODUP_SX 8*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 16*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 20*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +#### Unroll 4 #### +EDUP_SX 12*SIZE(ptrbb), xvec4; +ODUP_SX 12*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 24*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 28*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; +ADDQ $32*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L221_bodyB; +ALIGN_5 +.L221_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L222_loopE; +ALIGN_5 +.L222_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 
0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; + +#### Unroll 2 ##### +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; +ADDQ $16*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + + +.L222_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L223_loopE; +ALIGN_5 +.L223_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; +ADDQ $8*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L223_loopE: +#### Handle #### +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +#elif defined(RR) || defined(RC) || defined(CR) || 
defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec11, xvec11; +SHUF_SX $0xb1, xvec10, xvec10; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec14, xvec7, xvec7; +MOV_SX xvec7, xvec14; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec10, xvec7, xvec7; +MOV_SX xvec7, xvec10; +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec14, xvec14; +SHUF_SX $0xb1, xvec11, xvec11; +SHUF_SX $0xb1, xvec10, xvec10; +#endif +#### Multiply Alpha #### +BROAD_SX MEMALPHA_R, xvec7; +BROAD_SX MEMALPHA_I, xvec6; +#### Writing back #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +VPERMILP_SX $0xb1,xvec14, xvec4; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; +VPERMILP_SX $0xb1,xvec11, xvec1; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; +VPERMILP_SX $0xb1,xvec10, xvec0; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 4*SIZE(C0); +STH_SX xvec14, 6*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +#endif +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C0); +STL_SX xvec10, 4*SIZE(C1); +STH_SX xvec10, 6*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif + +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; + +.L22_loopE: +TEST $2, bm; +JLE .L23_loopE; +ALIGN_5 +.L23_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec11, yvec11, yvec11; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L231_loopE; +ALIGN_5 +.L231_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +#### Unroll 2 ##### +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX
4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +#### Unroll 3 #### +EDUP_SX 8*SIZE(ptrbb), xvec4; +ODUP_SX 8*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 8*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +#### Unroll 4 #### +EDUP_SX 12*SIZE(ptrbb), xvec4; +ODUP_SX 12*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 12*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; +ADDQ $16*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L231_bodyB; +ALIGN_5 +.L231_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L232_loopE; +ALIGN_5 +.L232_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; + +#### Unroll 2 ##### +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 4*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L232_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L233_loopE; +ALIGN_5 +.L233_bodyB: +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x4e, xvec4, xvec6; +SHUF_SX $0x4e, xvec5, xvec7; + +LD_SX 0*SIZE(ptrba), xvec0; +MOV_SX xvec0, xvec1; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; +SHUF_SX $0xb1, xvec1, xvec2; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; + +MOV_SX xvec2, xvec3; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L233_loopE: +#### Handle #### +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +#elif 
defined(NR) || defined(NC) || defined(TR) || defined(TC) +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +SUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec11, xvec11; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +XOR_SY yvec7, yvec7, yvec7; +ADDSUB_SX xvec11, xvec7, xvec7; +MOV_SX xvec7, xvec11; +SHUF_SX $0xb1, xvec15, xvec15; +SHUF_SX $0xb1, xvec11, xvec11; +#endif +#### Multiply Alpha #### +BROAD_SX MEMALPHA_R, xvec7; +BROAD_SX MEMALPHA_I, xvec6; +#### Writing back #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +VPERMILP_SX $0xb1,xvec11, xvec1; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +#endif +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif + +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; + +.L23_loopE: +TEST $1, bm; +JLE .L24_loopE; +ALIGN_5 +.L24_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L241_loopE; +ALIGN_5 +.L241_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +BROAD_SX 2*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 3*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +BROAD_SX 4*SIZE(ptrba), xvec0; +LD_SX 8*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 5*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +BROAD_SX 6*SIZE(ptrba), xvec0; +LD_SX 12*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 7*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L241_bodyB; +ALIGN_5 +.L241_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L242_loopE; +ALIGN_5
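+#### k & 2 tail: two passes of the .L241 body. BROAD_SX splats a_r then a_i, ####
+#### SHUF_SX $0xb1 swaps the (re,im) pairs of b, and ADD1/ADD2 fold in the ####
+#### real and cross products of the complex multiply. ####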
+.L242_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +BROAD_SX 2*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 3*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L242_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L243_loopE; +ALIGN_5 +.L243_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xb1, xvec2, xvec3; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; + +BROAD_SX 1*SIZE(ptrba), xvec1; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L243_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +SHUF_SX $0xb1, xvec15, xvec15; +#endif +##### Load Alpha #### +BROAD_SX MEMALPHA_R,xvec7; +BROAD_SX MEMALPHA_I,xvec6; +##### Multiply Alpha #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 0*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L24_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $2, kk; +#endif +MOVQ bk, k; +SALQ $4, k; +ADDQ k, bb; +LEAQ (C, ldc, 2), C; +.L20_loopE: +TEST $1, bn; +JLE .L30_loopE; +ALIGN_5 +.L30_bodyB: +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C, C0; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L31_loopE; +ALIGN_5 +.L31_bodyB: +MOVQ bb, ptrbb; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +MOVQ bk, k; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L311_loopE; +ALIGN_5 +.L311_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +LD_SY 8*SIZE(ptrba), yvec1; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + 
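+#### second half of unroll 1: broadcast b_i, pair-swap a with VPERMILP $0xb1, ####
+#### then accumulate the cross terms through ADD2. ####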
+BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; + +#### Unroll 2 #### +LD_SY 16*SIZE(ptrba), yvec0; +LD_SY 24*SIZE(ptrba), yvec1; +BROAD_SY 2*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 3*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; + +#### Unroll 3 #### +LD_SY 32*SIZE(ptrba), yvec0; +LD_SY 40*SIZE(ptrba), yvec1; +BROAD_SY 4*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 5*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; + +#### Unroll 4 #### +LD_SY 48*SIZE(ptrba), yvec0; +LD_SY 56*SIZE(ptrba), yvec1; +BROAD_SY 6*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 7*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; +ADDQ $64*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L311_bodyB; +ALIGN_5 +.L311_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L312_loopE; +ALIGN_5 +.L312_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +LD_SY 8*SIZE(ptrba), yvec1; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; + +#### Unroll 2 #### +LD_SY 16*SIZE(ptrba), yvec0; +LD_SY 24*SIZE(ptrba), yvec1; +BROAD_SY 2*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 3*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; +ADDQ $32*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L312_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L313_loopE; +ALIGN_5 +.L313_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +LD_SY 8*SIZE(ptrba), yvec1; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; +MUL_SY yvec1, yvec2, yvec7; +ADD1_SY yvec7, yvec14, yvec14; + +BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +VPERMILP_SY $0xb1, yvec1, yvec5; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +MUL_SY yvec5, yvec3, yvec7; +ADD2_SY yvec7, yvec14, yvec14; +ADDQ $16*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L313_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY 
yvec14, yvec7, yvec14; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_SY yvec15, yvec7, yvec15; +SUB_SY yvec14, yvec7, yvec14; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec14, yvec14; +ADDSUB_SY yvec15, yvec7, yvec15; +ADDSUB_SY yvec14, yvec7, yvec14; +VPERMILP_SY $0xb1, yvec15, yvec15; +VPERMILP_SY $0xb1, yvec14, yvec14; +#endif +##### Load Alpha #### +BROAD_SY MEMALPHA_R,yvec7; +BROAD_SY MEMALPHA_I,yvec6; +##### Multiply Alpha #### +VPERMILP_SY $0xb1,yvec15, yvec5; +MUL_SY yvec15, yvec7, yvec15; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec15, yvec15; +VPERMILP_SY $0xb1,yvec14, yvec4; +MUL_SY yvec14, yvec7, yvec14; +MUL_SY yvec4, yvec6, yvec4; +ADDSUB_SY yvec4, yvec14, yvec14; +#### Writing back #### +EXTRA_SY $1, yvec15, xvec7; +EXTRA_SY $1, yvec14, xvec6; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C0), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; +ADD_SX xvec2, xvec14, xvec14; +ADD_SX xvec3, xvec6, xvec6; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +STL_SX xvec7, 4*SIZE(C0); +STH_SX xvec7, 6*SIZE(C0); +STL_SX xvec14, 8*SIZE(C0); +STH_SX xvec14, 10*SIZE(C0); +STL_SX xvec6, 12*SIZE(C0); +STH_SX xvec6, 14*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $16*SIZE, C0; +DECQ i; +JG .L31_bodyB; +ALIGN_5 +.L31_loopE: +TEST $4, bm; +JLE .L32_loopE; +ALIGN_5 +.L32_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L321_loopE; +ALIGN_5 +.L321_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; + +#### Unroll 2 #### +LD_SY 8*SIZE(ptrba), yvec0; +BROAD_SY 2*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 3*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; + +#### Unroll 3 #### +LD_SY 16*SIZE(ptrba), yvec0; +BROAD_SY 4*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 5*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; + +#### Unroll 4 #### +LD_SY 24*SIZE(ptrba), yvec0; +BROAD_SY 6*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 
7*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +ADDQ $32*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L321_bodyB; +ALIGN_5 +.L321_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L322_loopE; +ALIGN_5 +.L322_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; + +#### Unroll 2 #### +LD_SY 8*SIZE(ptrba), yvec0; +BROAD_SY 2*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 3*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L322_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L323_loopE; +ALIGN_5 +.L323_bodyB: +#### Unroll 1 #### +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec2, yvec6; +ADD1_SY yvec6, yvec15, yvec15; + +BROAD_SY 1*SIZE(ptrbb), yvec3; +VPERMILP_SY $0xb1, yvec0, yvec4; +MUL_SY yvec4, yvec3, yvec6; +ADD2_SY yvec6, yvec15, yvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L323_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SY yvec15, yvec7, yvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_SY yvec15, yvec7, yvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_SY $0xb1, yvec15, yvec15; +ADDSUB_SY yvec15, yvec7, yvec15; +VPERMILP_SY $0xb1, yvec15, yvec15; +#endif +##### Load Alpha #### +BROAD_SY MEMALPHA_R,yvec7; +BROAD_SY MEMALPHA_I,yvec6; +##### Multiply Alpha #### +VPERMILP_SY $0xb1,yvec15, yvec5; +MUL_SY yvec15, yvec7, yvec15; +MUL_SY yvec5, yvec6, yvec5; +ADDSUB_SY yvec5, yvec15, yvec15; +#### Writing back #### +EXTRA_SY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +STL_SX xvec7, 4*SIZE(C0); +STH_SX xvec7, 6*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE, C0; + +.L32_loopE: +TEST $2, bm; +JLE .L33_loopE; +ALIGN_5 +.L33_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L331_loopE; +ALIGN_5 +.L331_bodyB: +#### Unroll 1 #### +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, 
xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 1*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +#### Unroll 2 #### +LD_SX 4*SIZE(ptrba), xvec0; +BROAD_SX 2*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 3*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +#### Unroll 3 #### +LD_SX 8*SIZE(ptrba), xvec0; +BROAD_SX 4*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 5*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +#### Unroll 4 #### +LD_SX 12*SIZE(ptrba), xvec0; +BROAD_SX 6*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 7*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $16*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L331_bodyB; +ALIGN_5 +.L331_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L332_loopE; +ALIGN_5 +.L332_bodyB: +#### Unroll 1 #### +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 1*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; + +#### Unroll 2 #### +LD_SX 4*SIZE(ptrba), xvec0; +BROAD_SX 2*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 3*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L332_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L333_loopE; +ALIGN_5 +.L333_bodyB: +#### Unroll 1 #### +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +BROAD_SX 1*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L333_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +SHUF_SX $0xb1, xvec15, xvec15; +#endif +#### Multiply Alpha #### +BROAD_SX MEMALPHA_R, xvec7; +BROAD_SX MEMALPHA_I, xvec6; +#### Writing back #### +VPERMILP_SX $0xb1,xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $4*SIZE, C0; + +.L33_loopE: +TEST $1, bm; +JLE .L34_loopE; +ALIGN_5 +.L34_bodyB: +#if
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L341_loopE; +ALIGN_5 +.L341_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xa0, xvec2, xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +SHUF_SX $0xf5, xvec2, xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; + +LD_SX 4*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrbb), xvec2; +SHUF_SX $0xa0, xvec2, xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +SHUF_SX $0xf5, xvec2, xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L341_bodyB; +ALIGN_5 +.L341_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L342_loopE; +ALIGN_5 +.L342_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0xa0, xvec2, xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; +SHUF_SX $0xb1, xvec0, xvec1; +SHUF_SX $0xf5, xvec2, xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L342_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L343_loopE; +ALIGN_5 +.L343_bodyB: +XOR_SY yvec0, yvec0, yvec0; +XOR_SY yvec2, yvec2, yvec2; +LDL_SX 0*SIZE(ptrba), xvec0, xvec0; +LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; +SHUF_SX $0xe0, xvec2, xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; +SHUF_SX $0xe1, xvec0, xvec1; +SHUF_SX $0xe5, xvec2, xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L343_loopE: +#### Handle #### +XOR_SY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_SX $0xb1, xvec15, xvec15; +ADDSUB_SX xvec15, xvec7, xvec7; +MOV_SX xvec7, xvec15; +SHUF_SX $0xb1, xvec15, xvec15; +#endif +BROAD_SX MEMALPHA_R, xvec7; +BROAD_SX MEMALPHA_I, xvec6; +VPERMILP_SX $0xb1, xvec15, xvec5; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; +SHUF_SX $0x44, xvec15, xvec14; +SHUF_SX $0xee, xvec15, xvec13; +ADD_SX xvec13, xvec14, xvec14; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec14, xvec14; +#endif +STL_SX xvec14, 0*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $2*SIZE, C0; + +.L34_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $1, kk; +#endif + +MOVQ bk, k; +SALQ $3, k; +ADDQ k, bb; +ADDQ ldc, C; +.L30_loopE: +movq 0(%rsp), %rbx; 
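+# restore the remaining callee-saved registers; the vzeroupper below
+# avoids AVX-to-SSE transition penalties after returning to the caller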
+movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; + +vzeroupper + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S new file mode 100644 index 000000000..f8a316b64 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -0,0 +1,1767 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + vmovups -16 * SIZE(AO, %rax, 4),%xmm0 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup -16 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -15 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm8,%xmm0,%xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ + vmovddup -13 * SIZE(BO, %rax, 4), %xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9,%xmm0,%xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0,%xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0,%xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ + +#define KERNEL2(xx) \ + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup -9 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8, %xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9, %xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ + +#define KERNEL3(xx) \ + vmovups -8 * SIZE(AO, %rax, 4),%xmm0 ;\ + vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup -8 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup -5 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1, %xmm8 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7, %xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ + +#define KERNEL4(xx) \ + vmovddup -4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup -1 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1 ,%xmm12;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ + +#define KERNEL5(xx) \ + vmovups (AO, %rax, 4), %xmm0 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup (BO, %rax, 4), %xmm1 ;\ + vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 2 
* SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 3 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovups 4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ + +#define KERNEL6(xx) \ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 7 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ + +#define KERNEL7(xx) \ + vmovups 8 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup 8 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 11 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups 14 * SIZE(AO, %rax, 4), %xmm6 ;\ + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ + +#define KERNEL8(xx) \ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 15 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13, %xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + vmovups -16 * SIZE(AO),%xmm0 ;\ + vmovups -14 * SIZE(AO),%xmm2 ;\ + vmovddup -16 * SIZE(BO), %xmm1 ;\ + vmovddup -15 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup -14 * SIZE(BO), %xmm1 ;\ + vmovddup -13 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10, %xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11, %xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14, %xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\ + + +#define KERNEL_SUB2(xx) \ + vmovups -12 * SIZE(AO), %xmm0 ;\ + vmovups -10 * SIZE(AO), %xmm2 ;\ + vmovddup -12 * SIZE(BO), %xmm1 ;\ + vmovddup -11 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -10 * SIZE(BO), %xmm1 ;\ + vmovddup -9 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + +#define KERNEL_SUB3(xx) \ + vmovups -8 * SIZE(AO),%xmm0 ;\ + vmovups -6 * SIZE(AO),%xmm2 ;\ + vmovddup -8 * SIZE(BO), %xmm1 ;\ + vmovddup -7 * SIZE(BO), %xmm3 ;\ 
+ vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -6 * SIZE(BO), %xmm1 ;\ + vmovddup -5 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + +#define KERNEL_SUB4(xx) \ + vmovups -4 * SIZE(AO), %xmm0 ;\ + vmovups -2 * SIZE(AO), %xmm2 ;\ + vmovddup -4 * SIZE(BO), %xmm1 ;\ + vmovddup -3 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -2 * SIZE(BO), %xmm1 ;\ + vmovddup -1 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups (AO), %xmm0 ;\ + vmovddup (BO), %xmm1 ;\ + vmovddup 1 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC # LDC << 3 # LDC * 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) # j = n / 4 + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + ldc + + leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + + .align 16 +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + vxorpd %xmm8, %xmm8,%xmm8 + vxorpd %xmm9, %xmm9,%xmm9 + vxorpd %xmm10, %xmm10,%xmm10 + vxorpd %xmm11, %xmm11,%xmm11 + vxorpd %xmm12, %xmm12,%xmm12 + vxorpd %xmm13, %xmm13,%xmm13 + vxorpd %xmm14, %xmm14,%xmm14 + vxorpd %xmm15, %xmm15,%xmm15 + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC) + prefetcht0 (CO2) + prefetcht0 (CO2,LDC) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + 
leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + // ALIGN_4 + + .align 16 + +#define PR1 16 +#define PR2 24 + +.L12: + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + vmovups -16 * SIZE(AO, %rax, 4), %xmm0 + vmovups -14 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -16 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 +/* + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 + vmovaps %xmm0, %xmm2 +*/ + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + // prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12,%xmm12 + .align 2 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm7, %xmm13,%xmm13 + .align 2 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm14,%xmm14 + .align 2 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + vfmaddpd 2 * SIZE(CO2, LDC),%xmm7, %xmm15,%xmm15 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm13,%xmm13 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm14,%xmm14 + vmulpd %xmm7, %xmm11,%xmm11 + vmulpd %xmm7, %xmm15,%xmm15 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + vmovups %xmm9, (CO1, LDC) + vmovups %xmm13, 2 * SIZE(CO1, LDC) + vmovups %xmm10, (CO2) + vmovups %xmm14, 2 * SIZE(CO2) + vmovups %xmm11, (CO2, LDC) + vmovups %xmm15, 2 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovups -12 * SIZE(AO), 
%xmm2 + vxorps %xmm9, %xmm9 ,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10, %xmm10 + vmovddup -15 * SIZE(BO), %xmm5 + vxorps %xmm11, %xmm11, %xmm11 + vmovddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -9 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovddup (BO, %rax, 4), %xmm1 + vmovddup -7 * SIZE(BO, %rax, 4), %xmm5 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -6 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -5 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -3 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -2 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -1 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovddup 8 * SIZE(BO, %rax, 4), %xmm3 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm11,%xmm11 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO1, LDC) + + vmovups %xmm10, (CO2) + vmovups %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M 
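+	// m & 1 tail of the 4-column panel: single A entries are broadcast with
+	// vmovddup against pairs of B values, accumulating into %xmm8-%xmm11;
+	// the partial sums are folded together (vaddpd) before write-back.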
+ je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9, %xmm9 + vmovddup -15 * SIZE(AO), %xmm4 + vxorps %xmm10, %xmm10,%xmm10 + vmovups -16 * SIZE(BO), %xmm1 + vxorps %xmm11, %xmm11,%xmm11 + vmovups -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + vfmaddpd %xmm10,%xmm4, %xmm1,%xmm10 + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 4), %xmm4,%xmm11 + vmovups (BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm4 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,-6 * SIZE(BO, %rax, 4), %xmm2,%xmm9 + vmovups -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -13 * SIZE(AO, %rax, 1), %xmm2 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,-2 * SIZE(BO, %rax, 4), %xmm2,%xmm11 + vmovups 8 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO1, LDC), %xmm0,%xmm0 + vmovsd (CO2), %xmm1 + vmovhpd (CO2, LDC), %xmm1,%xmm1 + + + vfmaddpd %xmm0, %xmm7,%xmm8,%xmm8 + vfmaddpd %xmm1, %xmm7,%xmm9,%xmm9 +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO1, LDC) + vmovsd %xmm9, (CO2) + vmovhpd %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: # N % 4 + testq $3, N # N % 4 == 3 + je .L999 # Jump to end if N % 4 == 0 + + testq $2, N # N % 4 == 2 + je .L80 + ALIGN_4 + +.L41: # N % 4 > 1 +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax # k << 4 + leaq (B, %rax), BB + + movq M, I + sarq $2, I 
# i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm5 + vmovddup -12 * SIZE(BO), %xmm3 + vxorps %xmm8, %xmm8,%xmm8 + vxorps %xmm9, %xmm9,%xmm9 + vxorps %xmm12, %xmm12,%xmm12 + vxorps %xmm13, %xmm13,%xmm13 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -8 * SIZE(AO), %xmm4 + vmovups %xmm0, %xmm2 + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: # Loop for (N % 4) == 2 + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovups -10 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vmovups (AO, %rax, 4), %xmm0 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm5 + vmovups -6 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm5 + vmovups -2 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovups 8 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -4 * SIZE(BO, %rax, 2), %xmm3 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + vfmaddpd (CO1),%xmm7, %xmm8, %xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12, %xmm12 + vfmaddpd (CO2),%xmm7, %xmm9, %xmm9 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm13, %xmm13 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm13,%xmm13 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + + vmovups %xmm9, (CO2) + vmovups %xmm13, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -15 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm8,%xmm2, %xmm1,%xmm8 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm2, %xmm3,%xmm9 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm2, %xmm1,%xmm10 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm2, %xmm3,%xmm11 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO2),%xmm7, %xmm9,%xmm9 + +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(AO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(BO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO2), %xmm0,%xmm0 +#endif + + vmulpd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddpd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N # N % 4 == 1 + je .L999 # Jump to end if N % 4 == 0 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -8 * SIZE(AO), %xmm2 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm12, %xmm12,%xmm12 + vmovddup -14 * SIZE(BO), %xmm3 + vxorps %xmm13, %xmm13,%xmm13 + vmovddup -15 * SIZE(BO), %xmm5 + + // prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovapd -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm1 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vfmaddpd %xmm13,-10 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd (AO, %rax, 4), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 1), %xmm5 + vfmaddpd %xmm8,%xmm3, %xmm2,%xmm8 + vfmaddpd %xmm12,-6 * SIZE(AO, %rax, 4), %xmm3,%xmm12 + vmovapd -4 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm3 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vfmaddpd %xmm13,-2 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd 8 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm13, %xmm12,%xmm12 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7,%xmm12,%xmm12 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(BO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(BO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(AO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(AO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(AO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, 
%rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + vmovddup -16 * SIZE(BO, %rax, 1), %xmm0 + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 +#else + vmulpd %xmm7, %xmm8,%xmm8 + +#endif + + vmovups %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + movups -14 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 1), %xmm0,%xmm8 + vmovups -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 1), %xmm1,%xmm9 + vmovups -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + vmulsd -16 * SIZE(BO, %rax, 1), %xmm0,%xmm0 + vaddsd %xmm0, %xmm8,%xmm8 + vmovsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + vaddpd %xmm9, %xmm8,%xmm8 + vhaddpd %xmm8, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 +#endif + + vmulsd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddsd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S new file mode 100644 index 000000000..3b1b2560e --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -0,0 +1,3173 @@ 
+/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + **********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define old_bm %rdi +#define old_bn %rsi +#define old_bk %rdx + +#define bm %r13 +#define bn %r14 +#define bk %r15 + +#define ALPHA %xmm0 +#define ba %rcx +#define bb %r8 +#define C %r9 +#define ldc %r10 + +#define i %r11 +#define k %rax + +#define ptrba %rdi +#define ptrbb %rsi +#define C0 %rbx +#define C1 %rbp + +#define prebb %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define old_ldc 8+STACKSIZE(%rsp) +#define old_offset 16+STACKSIZE(%rsp) +#define MEMALPHA 48(%rsp) +#define j 56(%rsp) +#define OFFSET 64(%rsp) +#define kk 72(%rsp) +#define kkk 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define old_ldc 64 + STACKSIZE(%rsp) +#define old_offset 72 + STACKSIZE(%rsp) + +#define MEMALPHA 224(%rsp) +#define j 232(%rsp) +#define OFFSET 240(%rsp) +#define kk 248(%rsp) +#define kkk 256(%rsp) + +#endif + +#define PREFETCH0 prefetcht0 +#define PREFETCH1 prefetcht0 +#define PREFETCH2 prefetcht2 + +#define xvec0 %xmm0 +#define xvec1 %xmm1 +#define xvec2 %xmm2 +#define xvec3 %xmm3 +#define xvec4 %xmm4 +#define xvec5 %xmm5 +#define xvec6 %xmm6 +#define xvec7 %xmm7 +#define xvec8 %xmm8 +#define xvec9 %xmm9 +#define xvec10 %xmm10 +#define xvec11 %xmm11 +#define xvec12 %xmm12 +#define xvec13 %xmm13 +#define xvec14 %xmm14 +#define xvec15 %xmm15 + +#define yvec0 %ymm0 +#define yvec1 %ymm1 +#define yvec2 %ymm2 +#define yvec3 %ymm3 +#define yvec4 %ymm4 +#define yvec5 %ymm5 +#define yvec6 %ymm6 +#define yvec7 %ymm7 +#define yvec8 %ymm8 +#define yvec9 %ymm9 +#define yvec10 %ymm10 +#define yvec11 %ymm11 +#define yvec12 %ymm12 +#define yvec13 %ymm13 +#define yvec14 %ymm14 +#define yvec15 %ymm15 + +#define LEAQ leaq +#define ADDQ addq +#define MULQ imulq +#define 
SARQ sarq +#define SALQ salq +#define ANDQ andq +#define SUBQ subq +#define DECQ decq +#define JG jg +#define JLE jle +#define TEST testq +#define OR orq +#define JNE jne +#define NOP +#define XOR xorpd +#undef MOVQ +#define MOVQ movq + +#define XOR_DY vxorpd +#define XOR_DX vxorpd + +#define LD_DY vmovapd +#define LD_DX vmovapd +#define LDL_DX vmovlpd +#define LDL_DY vmovlpd +#define LDH_DX vmovhpd +#define LDH_DY vmovhpd + +#define ST_DY vmovapd +#define ST_DX vmovapd +#define STL_DX vmovlpd +#define STL_DY vmovlpd +#define STH_DX vmovhpd +#define STH_DY vmovhpd + +#define EDUP_DY vmovddup + +#define ADD_DY vaddpd +#define ADD_DX vaddpd + +#define ADD1_DY vaddpd +#define ADD2_DY vaddpd +#define ADDSUB_DY vaddsubpd + +#define MUL_DY vmulpd +#define MUL_DX vmulpd + +#define SHUF_DY vperm2f128 +#define SHUF_DX vpshufd + +#define VPERMILP_DY vpermilpd + +#define BROAD_DY vbroadcastsd +#define BROAD_DX vmovddup + +#define MOV_DY vmovapd +#define MOV_DX vmovapd + +#define REVS_DY vshufpd +#define REVS_DX vmovsd + +#define EXTRA_DY vextractf128 + +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, old_bm + movq ARG2, old_bn + movq ARG3, old_bk + movq OLD_A, ba + movq OLD_B, bb + movq OLD_C, C + movq old_ldc, ldc +#ifdef TRMMKERNEL + movq old_offset, %r11 +#endif + movaps %xmm3, %xmm0 +#else + +movq old_ldc, ldc +#ifdef TRMMKERNEL +movq old_offset, %r11 +#endif +#endif + +vzeroupper + +vmovlps ALPHA, MEMALPHA +movq old_bm, bm +movq old_bn, bn +movq old_bk, bk +leaq (, ldc, SIZE), ldc +#ifdef TRMMKERNEL +movq %r11, OFFSET +#ifndef LEFT +negq %r11; +#endif +movq %r11, kk +#endif + +MOVQ bn,j; +SARQ $2,j; # Rn = 4 +JLE .L0_loopE; +ALIGN_5; +.L0_bodyB:; +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif + +MOVQ C,C0; +LEAQ (C,ldc,2),C1; +MOVQ bk, k; +SALQ $5, k; +LEAQ (bb, k, 1), prebb; +MOVQ ba,ptrba; +MOVQ bm,i; +SARQ $3,i; # Rm = 8 +JLE .L1_loopE; +ALIGN_5; +.L1_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial Results Register #### +PREFETCH2 0*SIZE(prebb); +XOR_DY yvec15, yvec15, yvec15; +PREFETCH2 8*SIZE(prebb); +XOR_DY yvec14, yvec14, yvec14; +XOR_DY yvec13, yvec13, yvec13; +ADDQ $16*SIZE, prebb +XOR_DY yvec12, yvec12, yvec12; +PREFETCH0 3*SIZE(C0) +LD_DY 0*SIZE(ptrbb), yvec2; +PREFETCH0 3*SIZE(C0, ldc, 1) +XOR_DY yvec11, yvec11, yvec11; +PREFETCH0 3*SIZE(C1) +XOR_DY yvec10, yvec10, yvec10; +PREFETCH0 3*SIZE(C1, ldc, 1) +LD_DY 0*SIZE(ptrba), yvec0; +XOR_DY yvec9, yvec9, yvec9; +XOR_DY yvec8, yvec8, yvec8; +VPERMILP_DY $0x05, yvec2, yvec3; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2,k; +JLE .L2_loopE; +ALIGN_5; 
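+# A sketch of the scheme used by the unrolled body below: an 8x4 block of C
+# is kept in yvec8-yvec15; each k step loads 8 doubles of A (yvec0/yvec1)
+# and 4 doubles of B (yvec2). VPERMILP_DY $0x05 swaps B within each 128-bit
+# lane and SHUF_DY $0x03 swaps the lanes themselves, giving the four B
+# permutations needed to pair every A element with every B element. Sandy
+# Bridge has no FMA, hence the separate MUL_DY/ADD_DY pairs; the "Reverse
+# the Results" pass after the loop undoes the permutation before writing C.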
+.L2_bodyB:; +# Computing kernel + +#### Unroll times 1 #### +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +PREFETCH0 64*SIZE(ptrba) +MUL_DY yvec1, yvec2, yvec6; +LD_DY 4*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 8*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +#### Unroll times 2 #### +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +PREFETCH0 72*SIZE(ptrba) +MUL_DY yvec1, yvec2, yvec6; +LD_DY 8*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 16*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +#### Unroll times 3 #### +LD_DY 20*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +PREFETCH0 80*SIZE(ptrba) +MUL_DY yvec1, yvec2, yvec6; +LD_DY 12*SIZE(ptrbb), yvec2; +ADDQ $16*SIZE, ptrbb; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 24*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +#### Unroll times 4 #### +LD_DY 28*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADDQ $32*SIZE, ptrba; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +PREFETCH0 88*SIZE(ptrba) +MUL_DY yvec1, yvec2, yvec6; +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 0*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; +.L2_bodyE:; +DECQ k; +JG .L2_bodyB; +ALIGN_5 +.L2_loopE:; +PREFETCH2 0*SIZE(prebb); +ADDQ $8*SIZE, prebb; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L3_loopE; +ALIGN_5 +.L3_bodyB: +#### Unroll times 1 #### +PREFETCH0 64*SIZE(ptrba) +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +MUL_DY yvec1, yvec2, yvec6; +LD_DY 
4*SIZE(ptrbb), yvec2; +ADDQ $8*SIZE, ptrbb; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 8*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +#### Unroll times 2 #### +PREFETCH0 72*SIZE(ptrba) +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADDQ $16*SIZE, ptrba; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +MUL_DY yvec1, yvec2, yvec6; +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +VPERMILP_DY $0x05, yvec2, yvec3; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 0*SIZE(ptrba), yvec0; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +.L3_loopE: +PREFETCH2 0*SIZE(prebb); +ADDQ $8*SIZE, prebb +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L4_loopE; +ALIGN_5 +.L4_bodyB:; +#### Unroll times 1 #### +PREFETCH0 64*SIZE(ptrba) +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADDQ $8*SIZE, ptrba; +ADD_DY yvec15, yvec6, yvec15; +ADD_DY yvec13, yvec7, yvec13; + +MUL_DY yvec1, yvec2, yvec6; +MUL_DY yvec1, yvec3, yvec7; +ADDQ $4*SIZE, ptrbb; +ADD_DY yvec14, yvec6, yvec14; +ADD_DY yvec12, yvec7, yvec12; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +ADD_DY yvec11, yvec6, yvec11; +ADD_DY yvec9, yvec7, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec10, yvec6, yvec10; +ADD_DY yvec8, yvec7, yvec8; + +.L4_loopE:; +#### Load Alpha #### +BROAD_DY MEMALPHA,yvec7; +#### Multiply Alpha #### +MUL_DY yvec7,yvec15,yvec15; +MUL_DY yvec7,yvec14,yvec14; +MUL_DY yvec7,yvec13,yvec13; +MUL_DY yvec7,yvec12,yvec12; +MUL_DY yvec7,yvec11,yvec11; +MUL_DY yvec7,yvec10,yvec10; +MUL_DY yvec7,yvec9,yvec9; +MUL_DY yvec7,yvec8,yvec8; +#### Reverse the Results #### +MOV_DY yvec15,yvec7; +REVS_DY $0x0a,yvec13,yvec15,yvec15; +REVS_DY $0x0a,yvec7,yvec13,yvec13; +MOV_DY yvec14,yvec7; +REVS_DY $0x0a,yvec12,yvec14,yvec14; +REVS_DY $0x0a,yvec7,yvec12,yvec12; +MOV_DY yvec11,yvec7; +REVS_DY $0x0a,yvec9,yvec11,yvec11; +REVS_DY $0x0a,yvec7,yvec9,yvec9; +MOV_DY yvec10,yvec7; +REVS_DY $0x0a,yvec8,yvec10,yvec10; +REVS_DY $0x0a,yvec7,yvec8,yvec8; +#### Testing alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L4_loopEx; # Unalign part write back +ALIGN_5 +#### Writing Back #### +EXTRA_DY $1,yvec15,xvec7; +EXTRA_DY $1,yvec14,xvec6; +EXTRA_DY $1,yvec13,xvec5; +EXTRA_DY $1,yvec12,xvec4; +EXTRA_DY $1,yvec11,xvec3; +EXTRA_DY $1,yvec10,xvec2; +EXTRA_DY $1,yvec9,xvec1; +EXTRA_DY $1,yvec8,xvec0; +#ifndef TRMMKERNEL +ADD_DY 0*SIZE(C0),xvec15,xvec15; +ADD_DY 2*SIZE(C1),xvec7,xvec7; +ADD_DY 4*SIZE(C0),xvec14,xvec14; +ADD_DY 6*SIZE(C1),xvec6,xvec6; +ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13; +ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5; +ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12; +ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4; +ADD_DY 0*SIZE(C1),xvec11,xvec11; +ADD_DY 2*SIZE(C0),xvec3,xvec3; +ADD_DY 4*SIZE(C1),xvec10,xvec10; 
+ADD_DY 6*SIZE(C0),xvec2,xvec2; +ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9; +ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1; +ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8; +ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0; +#endif +ST_DY xvec15, 0*SIZE(C0); +ST_DY xvec7, 2*SIZE(C1); +ST_DY xvec14, 4*SIZE(C0); +ST_DY xvec6, 6*SIZE(C1); +ST_DY xvec13, 0*SIZE(C0,ldc,1); +ST_DY xvec5, 2*SIZE(C1,ldc,1); +ST_DY xvec12, 4*SIZE(C0,ldc,1); +ST_DY xvec4, 6*SIZE(C1,ldc,1); +ST_DY xvec11, 0*SIZE(C1); +ST_DY xvec3, 2*SIZE(C0); +ST_DY xvec10, 4*SIZE(C1); +ST_DY xvec2, 6*SIZE(C0); +ST_DY xvec9, 0*SIZE(C1,ldc,1); +ST_DY xvec1, 2*SIZE(C0,ldc,1); +ST_DY xvec8, 4*SIZE(C1,ldc,1); +ST_DY xvec0, 6*SIZE(C0,ldc,1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE,C0; +ADDQ $8*SIZE,C1; +.L1_bodyE:; +DECQ i; +JG .L1_bodyB; +JMP .L1_loopE; +ALIGN_5; +.L4_loopEx:; +EXTRA_DY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C0), xvec6, xvec6; +LDH_DY 1*SIZE(C0), xvec6, xvec6; +ADD_DY xvec6, xvec15, xvec15; +LDL_DY 2*SIZE(C1), xvec5, xvec5; +LDH_DY 3*SIZE(C1), xvec5, xvec5; +ADD_DY xvec5, xvec7, xvec7; +#endif +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C1); +STH_DY xvec7, 3*SIZE(C1); + +EXTRA_DY $1, yvec14, xvec4; +#ifndef TRMMKERNEL +LDL_DY 4*SIZE(C0), xvec3, xvec3; +LDH_DY 5*SIZE(C0), xvec3, xvec3; +ADD_DY xvec3, xvec14, xvec14; +LDL_DY 6*SIZE(C1), xvec2, xvec2; +LDH_DY 7*SIZE(C1), xvec2, xvec2; +ADD_DY xvec2, xvec4, xvec4; +#endif +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec4, 6*SIZE(C1); +STH_DY xvec4, 7*SIZE(C1); + +EXTRA_DY $1, yvec13, xvec7; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_DY xvec6, xvec13, xvec13; +LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; +LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; +ADD_DY xvec5, xvec7, xvec7; +#endif +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec7, 2*SIZE(C1, ldc, 1); +STH_DY xvec7, 3*SIZE(C1, ldc, 1); + +EXTRA_DY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_DY xvec3, xvec12, xvec12; +LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_DY xvec2, xvec4, xvec4; +#endif +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc ,1); +STL_DY xvec4, 6*SIZE(C1, ldc, 1); +STH_DY xvec4, 7*SIZE(C1, ldc, 1); + +EXTRA_DY $1, yvec11, xvec7; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C1), xvec6, xvec6; +LDH_DY 1*SIZE(C1), xvec6, xvec6; +ADD_DY xvec6, xvec11, xvec11; +LDL_DY 2*SIZE(C0), xvec5, xvec5; +LDH_DY 3*SIZE(C0), xvec5, xvec5; +ADD_DY xvec5, xvec7, xvec7; +#endif +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec7, 2*SIZE(C0); +STH_DY xvec7, 3*SIZE(C0); + +EXTRA_DY $1, yvec10, xvec4; +#ifndef TRMMKERNEL +LDL_DY 4*SIZE(C1), xvec3, xvec3; +LDH_DY 5*SIZE(C1), xvec3, xvec3; +ADD_DY xvec3, xvec10, xvec10; +LDL_DY 6*SIZE(C0), xvec2, xvec2; +LDH_DY 7*SIZE(C0), xvec2, xvec2; +ADD_DY xvec2, xvec4, xvec4; +#endif +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); + +EXTRA_DY $1, yvec9, xvec7; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DY 
xvec6, xvec9, xvec9; +LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; +LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; +ADD_DY xvec5, xvec7, xvec7; +#endif +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); + +EXTRA_DY $1, yvec8, xvec4; +#ifndef TRMMKERNEL +LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_DY xvec3, xvec8, xvec8; +LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; +ADD_DY xvec2, xvec4, xvec4; +#endif +STL_DY xvec8, 4*SIZE(C1, ldc, 1); +STH_DY xvec8, 5*SIZE(C1, ldc, 1); +STL_DY xvec4, 6*SIZE(C0, ldc, 1); +STH_DY xvec4, 7*SIZE(C0, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif + +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L1_bodyB; +ALIGN_5 +.L1_loopE:; +TEST $4, bm; # Rm = 4 +JLE .L5_loopE; +ALIGN_5 +.L5_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec13, yvec13, yvec13; +LD_DY 0*SIZE(ptrbb), yvec2; +XOR_DY yvec11, yvec11, yvec11; +XOR_DY yvec9, yvec9, yvec9; +LD_DY 0*SIZE(ptrba), yvec0; +VPERMILP_DY $0x05, yvec2, yvec3; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L6_loopE; +ALIGN_5; +.L6_bodyB:; +# Computing kernel + +#### Untoll time 1 #### +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 4*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +MUL_DY yvec0, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +#### Untoll time 2 #### +LD_DY 8*SIZE(ptrba), yvec0; +MUL_DY yvec1, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 8*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +#### Untoll time 3 #### +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +ADDQ $16*SIZE, ptrba; +MUL_DY yvec0, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 12*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +ADDQ $16*SIZE, ptrbb; +MUL_DY yvec0, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +#### Untoll time 4 #### +LD_DY 0*SIZE(ptrba), yvec0; +MUL_DY yvec1, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, 
yvec4; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; +DECQ k; +JG .L6_bodyB; +ALIGN_5 +.L6_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L7_loopE; +ALIGN_5 +.L7_bodyB:; +#### Untoll time 1 #### +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +ADDQ $8*SIZE, ptrba; +MUL_DY yvec0, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 4*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +ADDQ $8*SIZE, ptrbb; +MUL_DY yvec0, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +#### Untoll time 2 #### +LD_DY 0*SIZE(ptrba), yvec0; +MUL_DY yvec1, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +VPERMILP_DY $0x05, yvec2, yvec3; +MUL_DY yvec1, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +.L7_loopE:; +#ifndef TRMMKERNEL +TEST $1, bk +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L8_loopE; +ALIGN_5 +.L8_bodyB:; +#### Untoll time 1 #### +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +ADDQ $4*SIZE, ptrba; +MUL_DY yvec0, yvec3, yvec7; +ADD_DY yvec13, yvec7, yvec13; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD_DY yvec11, yvec6, yvec11; +ADDQ $4*SIZE, ptrbb; +MUL_DY yvec0, yvec5, yvec7; +ADD_DY yvec9, yvec7, yvec9; + +.L8_loopE:; +#### Load Alpha #### +BROAD_DY MEMALPHA, yvec7; +#### Multiply Alpha #### +MUL_DY yvec7,yvec15,yvec15; +MUL_DY yvec7,yvec13,yvec13; +MUL_DY yvec7,yvec11,yvec11; +MUL_DY yvec7,yvec9,yvec9; +#### Reverse the Results #### +MOV_DY yvec15, yvec7; +REVS_DY $0x0a,yvec13,yvec15,yvec15; +REVS_DY $0x0a,yvec7,yvec13,yvec13; +MOV_DY yvec11,yvec7; +REVS_DY $0x0a,yvec9,yvec11,yvec11; +REVS_DY $0x0a,yvec7,yvec9,yvec9; +#### Testing alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L8_loopEx; # Unalign part write back +ALIGN_5 +#### Writing Back #### +EXTRA_DY $1,yvec15,xvec7; +EXTRA_DY $1,yvec13,xvec5; +EXTRA_DY $1,yvec11,xvec3; +EXTRA_DY $1,yvec9,xvec1; +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec3, xvec3; +ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; +ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; +#endif +ST_DX xvec15, 0*SIZE(C0); +ST_DX xvec7, 2*SIZE(C1); +ST_DX xvec13, 0*SIZE(C0,ldc,1); +ST_DX xvec5, 2*SIZE(C1,ldc,1); +ST_DX xvec11, 0*SIZE(C1); +ST_DX xvec3, 2*SIZE(C0); +ST_DX xvec9, 0*SIZE(C1,ldc,1); +ST_DX xvec1, 2*SIZE(C0,ldc,1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +JMP .L5_loopE; +ALIGN_5 +.L8_loopEx:; +EXTRA_DY $1,yvec15,xvec7; 
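+# Unaligned write-back path (reached when C0 | ldc has low bits set, i.e.
+# the C rows are not 16-byte aligned): each ymm accumulator is split into
+# xmm halves with EXTRA_DY, then read, combined, and stored one double at a
+# time via the vmovlpd/vmovhpd-based LDL/LDH and STL/STH macros.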
+EXTRA_DY $1,yvec13,xvec5; +EXTRA_DY $1,yvec11,xvec3; +EXTRA_DY $1,yvec9,xvec1; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec15, xvec15; +ADD_DX xvec12, xvec13, xvec13; +ADD_DX xvec10, xvec11, xvec11; +ADD_DX xvec8, xvec9, xvec9; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec13, 0*SIZE(C0, ldc, 1); +STH_DX xvec13, 1*SIZE(C0, ldc, 1); +STL_DX xvec11, 0*SIZE(C1); +STH_DX xvec11, 1*SIZE(C1); +STL_DX xvec9, 0*SIZE(C1, ldc, 1); +STH_DX xvec9, 1*SIZE(C1, ldc, 1); +#ifndef TRMMKERNEL +LDL_DX 2*SIZE(C0), xvec0, xvec0; +LDH_DX 3*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec4, xvec4; +LDH_DX 3*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX xvec0, xvec3, xvec3; +ADD_DX xvec2, xvec1, xvec1; +ADD_DX xvec4, xvec7, xvec7; +ADD_DX xvec6, xvec5, xvec5; +#endif +STL_DX xvec3, 2*SIZE(C0); +STH_DX xvec3, 3*SIZE(C0); +STL_DX xvec1, 2*SIZE(C0, ldc, 1); +STH_DX xvec1, 3*SIZE(C0, ldc, 1); +STL_DX xvec7, 2*SIZE(C1); +STH_DX xvec7, 3*SIZE(C1); +STL_DX xvec5, 2*SIZE(C1, ldc, 1); +STH_DX xvec5, 3*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $4, kk +#endif + +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L5_loopE:; +TEST $2, bm; +JLE .L9_loopE; +ALIGN_5 +.L9_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb +#endif +#### Initial Results Register #### +LD_DX 0*SIZE(ptrbb), xvec2; +XOR_DY yvec15, yvec15, yvec15; +LD_DX 2*SIZE(ptrbb), xvec3; +XOR_DY yvec13, yvec13, yvec13; +LD_DX 0*SIZE(ptrba), xvec0; +XOR_DY yvec11, yvec11, yvec11; +SHUF_DX $0x4e, xvec2, xvec4; +XOR_DY yvec9, yvec9, yvec9; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L10_loopE; +ALIGN_5; +.L10_bodyB:; +# Computing kernel + +##### Unroll time 1 #### +LD_DX 4*SIZE(ptrbb), xvec6; +SHUF_DX $0x4e, xvec3, xvec5; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +LD_DX 6*SIZE(ptrbb), xvec7; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; + +LD_DX 2*SIZE(ptrba), xvec1; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; +SHUF_DX $0x4e, xvec6, xvec4; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +#### Unroll time 2 #### +LD_DX 8*SIZE(ptrbb), xvec2; +SHUF_DX $0x4e, xvec7, xvec5; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; + +LD_DX 10*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; + +LD_DX 4*SIZE(ptrba), xvec0; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, 
xvec13, xvec13; +SHUF_DX $0x4e, xvec2, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +##### Unroll time 3 #### +LD_DX 12*SIZE(ptrbb), xvec6; +SHUF_DX $0x4e, xvec3, xvec5; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +LD_DX 14*SIZE(ptrbb), xvec7; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; +ADDQ $16*SIZE, ptrbb; + +LD_DX 6*SIZE(ptrba), xvec1; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; +SHUF_DX $0x4e, xvec6, xvec4; +ADDQ $8*SIZE, ptrba; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +#### Unroll time 4 #### +LD_DX 0*SIZE(ptrbb), xvec2; +SHUF_DX $0x4e, xvec7, xvec5; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; + +LD_DX 2*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; + +LD_DX 0*SIZE(ptrba), xvec0; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; +SHUF_DX $0x4e, xvec2, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; +DECQ k; +JG .L10_bodyB; +ALIGN_5 +.L10_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L11_loopE; +ALIGN_5 +.L11_bodyB:; +##### Unroll time 1 #### +LD_DX 4*SIZE(ptrbb), xvec6; +SHUF_DX $0x4e, xvec3, xvec5; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +LD_DX 6*SIZE(ptrbb), xvec7; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; +ADDQ $8*SIZE, ptrbb; + +LD_DX 2*SIZE(ptrba), xvec1; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; +SHUF_DX $0x4e, xvec6, xvec4; +ADDQ $4*SIZE, ptrba; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +#### Unroll time 2 #### +LD_DX 0*SIZE(ptrbb), xvec2; +SHUF_DX $0x4e, xvec7, xvec5; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; + +LD_DX 2*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; + +LD_DX 0*SIZE(ptrba), xvec0; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; +SHUF_DX $0x4e, xvec2, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +.L11_loopE:; +#ifndef TRMMKERNEL +TEST $1, bk +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L12_loopE; +ALIGN_5 +.L12_bodyB:; +SHUF_DX $0x4e, xvec3, xvec5; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; +ADDQ $4*SIZE, ptrbb; + +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; +ADDQ $2*SIZE, ptrba; + +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; + +.L12_loopE:; +#### Load Alpha #### +BROAD_DX MEMALPHA, xvec7; +#### Multiply Alpha #### +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec9, xvec9; +#### Reverse the Results #### +MOV_DX xvec15, xvec6; +REVS_DX xvec13, xvec15, xvec15; +REVS_DX xvec6, xvec13, xvec13; +MOV_DX xvec11, xvec6; +REVS_DX xvec9, xvec11, xvec11; +REVS_DX xvec6, xvec9, xvec9; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L12_loopEx; +ALIGN_5 +#### Writing Back #### +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec13, xvec13; +ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; +ADD_DX 0*SIZE(C1), xvec9, xvec9; +ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; +#endif +ST_DX xvec13, 0*SIZE(C0); +ST_DX xvec15, 0*SIZE(C0, ldc, 1); +ST_DX xvec9, 0*SIZE(C1); +ST_DX xvec11, 0*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ 
(ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk +#endif +ADDQ $2*SIZE, C0 +ADDQ $2*SIZE, C1 +JMP .L9_loopE; +ALIGN_5 +.L12_loopEx: +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec13, xvec13; +ADD_DX xvec12, xvec15, xvec15; +ADD_DX xvec10, xvec9, xvec9; +ADD_DX xvec8, xvec11, xvec11; +#endif +STL_DX xvec13, 0*SIZE(C0); +STH_DX xvec13, 1*SIZE(C0); +STL_DX xvec15, 0*SIZE(C0, ldc, 1); +STH_DX xvec15, 1*SIZE(C0, ldc, 1); +STL_DX xvec9, 0*SIZE(C1); +STH_DX xvec9, 1*SIZE(C1); +STL_DX xvec11, 0*SIZE(C1, ldc, 1); +STH_DX xvec11, 1*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L9_loopE:; +TEST $1, bm +JLE .L13_loopE; +ALIGN_5 +.L13_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L14_loopE; +ALIGN_5 +.L14_bodyB:; +BROAD_DY 0*SIZE(ptrba), yvec0; +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; + +BROAD_DY 1*SIZE(ptrba), yvec1; +LD_DY 4*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec15, yvec7, yvec15; + +BROAD_DY 2*SIZE(ptrba), yvec0; +LD_DY 8*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; + +BROAD_DY 3*SIZE(ptrba), yvec1; +LD_DY 12*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec15, yvec7, yvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L14_bodyB; +ALIGN_5 +.L14_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L15_loopE; +ALIGN_5 +.L15_bodyB: +BROAD_DY 0*SIZE(ptrba), yvec0; +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; + +BROAD_DY 1*SIZE(ptrba), yvec1; +LD_DY 4*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD_DY yvec15, yvec7, yvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +.L15_loopE:; +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L16_loopE; +ALIGN_5 +.L16_bodyB:; +BROAD_DY 0*SIZE(ptrba), yvec0; +LD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD_DY yvec15, yvec6, yvec15; +ADDQ $1*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L16_loopE: +#### Load Alpha #### +BROAD_DY MEMALPHA, yvec7; +#### Multiply Alpha #### +MUL_DY yvec15, yvec7, yvec15; +#### Writing Back #### +EXTRA_DY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_DX 
0*SIZE(C1), xvec1, xvec1; +LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 0*SIZE(C0, ldc, 1); +STL_DX xvec7, 0*SIZE(C1); +STH_DX xvec7, 0*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $1, kk +#endif +ADDQ $1*SIZE, C0 +ADDQ $1*SIZE, C1 +.L13_loopE:; +#if defined(TRMMKERNEL)&&!defined(LEFT) +ADDQ $4, kk +#endif +MOVQ bk,k; +SALQ $5,k; +ADDQ k,bb; +LEAQ (C,ldc,4),C; +.L0_bodyE:; +DECQ j; +JG .L0_bodyB; +ALIGN_5; +.L0_loopE:; +TEST $2, bn; +JLE .L20_loopE; +ALIGN_5; +.L20_loopB:; +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk +#endif +MOVQ C, C0; +LEAQ (C, ldc, 1), C1; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; # Rm = 8 +JLE .L21_loopE; +ALIGN_5; +.L21_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +XOR_DY yvec13, yvec13, yvec13; +XOR_DY yvec12, yvec12, yvec12; +XOR_DY yvec11, yvec11, yvec11; +XOR_DY yvec10, yvec10, yvec10; +XOR_DY yvec9, yvec9, yvec9; +XOR_DY yvec8, yvec8, yvec8; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L211_loopE; +ALIGN_5; +.L211_bodyB: +# Computing kernel +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 4*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 6*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +#### Unroll time 2 #### +LD_DX 8*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 10*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 12*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 14*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, 
xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +#### Unroll time 3 #### +LD_DX 16*SIZE(ptrba), xvec0; +LD_DX 4*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 18*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 20*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 22*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +#### Unroll time 4 #### +LD_DX 24*SIZE(ptrba), xvec0; +LD_DX 6*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $8*SIZE, ptrbb; + +LD_DX 26*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 28*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 30*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; +ADDQ $32*SIZE, ptrba; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; +DECQ k; +JG .L211_bodyB; +ALIGN_5 +.L211_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L212_loopE; +ALIGN_5; +.L212_bodyB: +# Computing kernel +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 4*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 6*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +#### Unroll time 2 #### +LD_DX 8*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $4*SIZE, ptrbb; + +LD_DX 10*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 12*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 14*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; +ADDQ $16*SIZE, ptrba; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX 
xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +.L212_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L213_loopE; +ALIGN_5 +.L213_bodyB: +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $2*SIZE, ptrbb; + +LD_DX 2*SIZE(ptrba), xvec1; +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +LD_DX 4*SIZE(ptrba), xvec2; +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; + +LD_DX 6*SIZE(ptrba), xvec3; +SHUF_DX $0x4e, xvec7, xvec4; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; +ADDQ $8*SIZE, ptrba; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MOV_DX xvec5, xvec6; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +MOV_DX xvec6, xvec7; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; + +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; + +.L213_loopE: +#### Multiply Alpha #### +BROAD_DX MEMALPHA, xvec7; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec12, xvec12; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; +MUL_DX xvec7, xvec9, xvec9; +MUL_DX xvec7, xvec8, xvec8; +#### Reverse ##### +MOV_DX xvec15, xvec6; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; +MOV_DX xvec14, xvec6; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; +MOV_DX xvec13, xvec6; +REVS_DX xvec9, xvec13, xvec13; +REVS_DX xvec6, xvec9, xvec9; +MOV_DX xvec12, xvec6; +REVS_DX xvec8, xvec12, xvec12; +REVS_DX xvec6, xvec8, xvec8; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L213_loopEx; +ALIGN_5 +#### Writing Back #### +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 4*SIZE(C0), xvec9, xvec9; +ADD_DX 6*SIZE(C0), xvec8, xvec8; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; +ADD_DX 4*SIZE(C1), xvec13, xvec13; +ADD_DX 6*SIZE(C1), xvec12, xvec12; +#endif +ST_DX xvec11, 0*SIZE(C0); +ST_DX xvec10, 2*SIZE(C0); +ST_DX xvec9, 4*SIZE(C0); +ST_DX xvec8, 6*SIZE(C0); +ST_DX xvec15, 0*SIZE(C1); +ST_DX xvec14, 2*SIZE(C1); +ST_DX xvec13, 4*SIZE(C1); +ST_DX xvec12, 6*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L21_bodyB; +JMP .L21_loopE; +ALIGN_5 +.L213_loopEx:; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; +ADD_DX xvec2, xvec9, xvec9; +ADD_DX xvec3, xvec8, xvec8; +#endif +STL_DX xvec11, 0*SIZE(C0); +STH_DX xvec11, 1*SIZE(C0); +STL_DX xvec10, 2*SIZE(C0); +STH_DX xvec10, 3*SIZE(C0); +STL_DX xvec9, 4*SIZE(C0); +STH_DX xvec9, 5*SIZE(C0); +STL_DX xvec8, 6*SIZE(C0); +STH_DX xvec8, 7*SIZE(C0); +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C1), xvec4, 
xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +LDL_DX 4*SIZE(C1), xvec6, xvec6; +LDH_DX 5*SIZE(C1), xvec6, xvec6; +LDL_DX 6*SIZE(C1), xvec7, xvec7; +LDH_DX 7*SIZE(C1), xvec7, xvec7; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; +ADD_DX xvec6, xvec13, xvec13; +ADD_DX xvec7, xvec12, xvec12; +#endif +STL_DX xvec15, 0*SIZE(C1); +STH_DX xvec15, 1*SIZE(C1); +STL_DX xvec14, 2*SIZE(C1); +STH_DX xvec14, 3*SIZE(C1); +STL_DX xvec13, 4*SIZE(C1); +STH_DX xvec13, 5*SIZE(C1); +STL_DX xvec12, 6*SIZE(C1); +STH_DX xvec12, 7*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L21_bodyB; +.L21_loopE:; +TEST $4, bm; # Rm = 4 +JLE .L22_loopE; +ALIGN_5; +.L22_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +XOR_DY yvec11, yvec11, yvec11; +XOR_DY yvec10, yvec10, yvec10; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L221_loopE; +ALIGN_5 +.L221_bodyB:; +# Computing kernel +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +#### Unroll time 2 #### +LD_DX 4*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 6*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +#### Unroll time 3 #### +LD_DX 8*SIZE(ptrba), xvec0; +LD_DX 4*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 10*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +#### Unroll time 4 #### +LD_DX 12*SIZE(ptrba), xvec0; +LD_DX 6*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $8*SIZE, ptrbb; + +LD_DX 14*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; +ADDQ $16*SIZE, ptrba; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, 
xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; +DECQ k; +JG .L221_bodyB; +ALIGN_5 +.L221_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L222_loopE; +ALIGN_5 +.L222_bodyB: +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +#### Unroll time 2 #### +LD_DX 4*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $4*SIZE, ptrbb; + +LD_DX 6*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; +ADDQ $8*SIZE, ptrba; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +.L222_loopE: +#ifndef TRMMKERNEL +TEST $1, bk +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L223_loopE; +ALIGN_5 +.L223_bodyB: +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $2*SIZE, ptrbb; + +LD_DX 2*SIZE(ptrba), xvec1; +SHUF_DX $0x4e, xvec5, xvec4; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; +ADDQ $4*SIZE, ptrba; + +MOV_DX xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; + +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; + +.L223_loopE: +#### Multiply Alpha #### +BROAD_DX MEMALPHA, xvec7; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; +#### Reverse ##### +MOV_DX xvec15, xvec6; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; +MOV_DX xvec14, xvec6; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L223_loopEx; +ALIGN_5 +#### Writing Back #### +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; +#endif +ST_DX xvec11, 0*SIZE(C0); +ST_DX xvec10, 2*SIZE(C0); +ST_DX xvec15, 0*SIZE(C1); +ST_DX xvec14, 2*SIZE(C1); +#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +JMP .L22_loopE; +ALIGN_5 +.L223_loopEx:; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; +#endif +STL_DX xvec11, 0*SIZE(C0); +STH_DX xvec11, 1*SIZE(C0); +STL_DX xvec10, 2*SIZE(C0); +STH_DX xvec10, 3*SIZE(C0); +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; +#endif +STL_DX xvec15, 0*SIZE(C1); 
+STH_DX xvec15, 1*SIZE(C1); +STL_DX xvec14, 2*SIZE(C1); +STH_DX xvec14, 3*SIZE(C1); +#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L22_loopE:; +TEST $2, bm; # Rm = 2 +JLE .L23_loopE; +ALIGN_5; +.L23_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec11, yvec11, yvec11; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L231_loopE; +ALIGN_5 +.L231_bodyB: +# Computing kernel +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +#### Unroll time 2 #### +LD_DX 2*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +#### Unroll time 3 #### +LD_DX 4*SIZE(ptrba), xvec0; +LD_DX 4*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +#### Unroll time 4 #### +LD_DX 6*SIZE(ptrba), xvec0; +LD_DX 6*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L231_bodyB; +ALIGN_5 +.L231_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L232_loopE; +ALIGN_5 +.L232_bodyB: +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; + +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +#### Unroll time 2 #### +LD_DX 2*SIZE(ptrba), xvec0; +LD_DX 2*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +ADDQ $4*SIZE, ptrbb; +.L232_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L233_loopE; +ALIGN_5 +.L233_bodyB: +#### Unroll time 1 #### +LD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec4; +SHUF_DX $0x4e, xvec4, xvec5; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; +ADDQ $2*SIZE, ptrbb; +.L233_loopE: +#### Multiply Alpha #### +BROAD_DX MEMALPHA, xvec7; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec11, xvec11; +#### Reverse ##### +MOV_DX xvec15, xvec6; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; +#### Testing Alignment #### 
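+# If C0 or ldc is not 16-byte aligned, the OR/TEST below routes control to .L233_loopEx, which writes C with split STL_DX/STH_DX moves instead of the aligned ST_DX path.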
+MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L233_loopEx; +ALIGN_5 +#### Writing Back #### +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +#endif +ST_DX xvec11, 0*SIZE(C0); +ST_DX xvec15, 0*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +JMP .L23_loopE; +ALIGN_5 +.L233_loopEx:; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec11, xvec11; +#endif +STL_DX xvec11, 0*SIZE(C0); +STH_DX xvec11, 1*SIZE(C0); +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; +#endif +STL_DX xvec15, 0*SIZE(C1); +STH_DX xvec15, 1*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L23_loopE: +TEST $1, bm; # Rm = 1 +JLE .L24_loopE; +ALIGN_5; +.L24_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L241_loopE; +ALIGN_5 +.L241_bodyB: +BROAD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +BROAD_DX 1*SIZE(ptrba), xvec1; +LD_DX 2*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; + +BROAD_DX 2*SIZE(ptrba), xvec0; +LD_DX 4*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +BROAD_DX 3*SIZE(ptrba), xvec1; +LD_DX 6*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L241_bodyB; +ALIGN_5 +.L241_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L242_loopE; +ALIGN_5 +.L242_bodyB: +BROAD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +BROAD_DX 1*SIZE(ptrba), xvec1; +LD_DX 2*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +.L242_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L243_loopE; +ALIGN_5 +.L243_bodyB: +BROAD_DX 0*SIZE(ptrba), xvec0; +LD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; +ADDQ $1*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L243_loopE: +BROAD_DX MEMALPHA, xvec7; +MUL_DX xvec7, xvec15, xvec15; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C1), xvec0, xvec0; +ADD_DX xvec0, xvec15, 
xvec15; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 0*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $1*SIZE, C0; +ADDQ $1*SIZE, C1; +.L24_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $2, kk; +#endif +MOVQ bk, k; +SALQ $4, k; +ADDQ k, bb; +LEAQ (C, ldc, 2), C; +.L20_loopE:; +TEST $1, bn; # Rn = 1 +JLE .L30_loopE; +ALIGN_5 +.L30_bodyB: +#if defined(TRMMKERNEL)&&defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C, C0; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L31_loopE; +ALIGN_5 +.L31_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L311_loopE; +ALIGN_5 +.L311_bodyB: +#### Unroll time 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +LD_DY 4*SIZE(ptrba), yvec1; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec2, yvec0, yvec0; +ADD_DY yvec0, yvec15, yvec15; +MUL_DY yvec2, yvec1, yvec1; +ADD_DY yvec1, yvec14, yvec14; + +#### Unroll time 2 #### +LD_DY 8*SIZE(ptrba), yvec3; +LD_DY 12*SIZE(ptrba), yvec4; +BROAD_DY 1*SIZE(ptrbb), yvec5; +MUL_DY yvec5, yvec3, yvec3; +ADD_DY yvec3, yvec15, yvec15; +MUL_DY yvec5, yvec4, yvec4; +ADD_DY yvec4, yvec14, yvec14; + +#### Unroll time 3 #### +LD_DY 16*SIZE(ptrba), yvec0; +LD_DY 20*SIZE(ptrba), yvec1; +BROAD_DY 2*SIZE(ptrbb), yvec2; +MUL_DY yvec2, yvec0, yvec0; +ADD_DY yvec0, yvec15, yvec15; +MUL_DY yvec2, yvec1, yvec1; +ADD_DY yvec1, yvec14, yvec14; + +#### Unroll time 4 #### +LD_DY 24*SIZE(ptrba), yvec3; +LD_DY 28*SIZE(ptrba), yvec4; +BROAD_DY 3*SIZE(ptrbb), yvec5; +MUL_DY yvec5, yvec3, yvec3; +ADD_DY yvec3, yvec15, yvec15; +ADDQ $32*SIZE, ptrba; +MUL_DY yvec5, yvec4, yvec4; +ADD_DY yvec4, yvec14, yvec14; +ADDQ $4*SIZE, ptrbb; +DECQ k; +JG .L311_bodyB; +ALIGN_5 +.L311_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L312_loopE; +ALIGN_5 +.L312_bodyB: +#### Unroll time 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +LD_DY 4*SIZE(ptrba), yvec1; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec2, yvec0, yvec0; +ADD_DY yvec0, yvec15, yvec15; +MUL_DY yvec2, yvec1, yvec1; +ADD_DY yvec1, yvec14, yvec14; + +#### Unroll time 2 #### +LD_DY 8*SIZE(ptrba), yvec3; +LD_DY 12*SIZE(ptrba), yvec4; +BROAD_DY 1*SIZE(ptrbb), yvec5; +MUL_DY yvec5, yvec3, yvec3; +ADD_DY yvec3, yvec15, yvec15; +ADDQ $16*SIZE, ptrba; +MUL_DY yvec5, yvec4, yvec4; +ADD_DY yvec4, yvec14, yvec14; +ADDQ $2*SIZE, ptrbb; + +.L312_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L313_loopE; +ALIGN_5 +.L313_bodyB: +#### Unroll time 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +LD_DY 4*SIZE(ptrba), yvec1; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec2, yvec0, yvec0; +ADD_DY yvec0, yvec15, yvec15; +ADDQ $8*SIZE, ptrba; +MUL_DY yvec2, yvec1, yvec1;
+ADD_DY yvec1, yvec14, yvec14; +ADDQ $1*SIZE, ptrbb; + +.L313_loopE: +#### Multiply Alpha #### +BROAD_DY MEMALPHA, yvec7; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec7, yvec14, yvec14; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L313_loopEx; +ALIGN_5 +#### Writing Back #### +EXTRA_DY $1, yvec15, xvec13; +EXTRA_DY $1, yvec14, xvec12; +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec13, xvec13; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C0), xvec12, xvec12; +#endif +ST_DX xvec15, 0*SIZE(C0); +ST_DX xvec13, 2*SIZE(C0); +ST_DX xvec14, 4*SIZE(C0); +ST_DX xvec12, 6*SIZE(C0); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $8*SIZE, C0; +DECQ i; +JG .L31_bodyB; +JMP .L31_loopE; +ALIGN_5 +.L313_loopEx: +EXTRA_DY $1, yvec15, xvec13; +EXTRA_DY $1, yvec14, xvec12; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec11, xvec11; +LDH_DX 1*SIZE(C0), xvec11, xvec11; +LDL_DX 2*SIZE(C0), xvec10, xvec10; +LDH_DX 3*SIZE(C0), xvec10, xvec10; +LDL_DX 4*SIZE(C0), xvec9, xvec9; +LDH_DX 5*SIZE(C0), xvec9, xvec9; +LDL_DX 6*SIZE(C0), xvec8, xvec8; +LDH_DX 7*SIZE(C0), xvec8, xvec8; +ADD_DX xvec11, xvec15, xvec15; +ADD_DX xvec10, xvec13, xvec13; +ADD_DX xvec9, xvec14, xvec14; +ADD_DX xvec8, xvec12, xvec12; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec13, 2*SIZE(C0); +STH_DX xvec13, 3*SIZE(C0); +STL_DX xvec14, 4*SIZE(C0); +STH_DX xvec14, 5*SIZE(C0); +STL_DX xvec12, 6*SIZE(C0); +STH_DX xvec12, 7*SIZE(C0); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $8*SIZE, C0; +DECQ i; +JG .L31_bodyB; +.L31_loopE: +TEST $4, bm +JLE .L32_loopE; +ALIGN_5 +.L32_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#### Initial Results Register #### +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk +#endif +SARQ $2, k; +JLE .L321_loopE; +ALIGN_5 +.L321_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec1; +MUL_DY yvec0, yvec1, yvec1; +ADD_DY yvec1, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec2; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec2, yvec3, yvec3; +ADD_DY yvec3, yvec15, yvec15; + +LD_DY 8*SIZE(ptrba), yvec4; +BROAD_DY 2*SIZE(ptrbb), yvec5; +MUL_DY yvec4, yvec5, yvec5; +ADD_DY yvec5, yvec15, yvec15; + +LD_DY 12*SIZE(ptrba), yvec6; +BROAD_DY 3*SIZE(ptrbb), yvec7; +MUL_DY yvec6, yvec7, yvec7; +ADD_DY yvec7, yvec15, yvec15; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +DECQ k; +JG .L321_bodyB; +ALIGN_5 +.L321_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L322_loopE; +ALIGN_5 +.L322_bodyB: +LD_DY 
0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec1; +MUL_DY yvec0, yvec1, yvec1; +ADD_DY yvec1, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec2; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec2, yvec3, yvec3; +ADD_DY yvec3, yvec15, yvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L322_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L323_loopE; +ALIGN_5 +.L323_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec1; +MUL_DY yvec0, yvec1, yvec1; +ADD_DY yvec1, yvec15, yvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $1*SIZE, ptrbb; + +.L323_loopE: +#### Multiply Alpha #### +BROAD_DY MEMALPHA, yvec7; +MUL_DY yvec7, yvec15, yvec15; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L323_loopEx; +ALIGN_5 +#### Writing Back #### +EXTRA_DY $1, yvec15, xvec14; +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec14, xvec14; +#endif +ST_DX xvec15, 0*SIZE(C0); +ST_DX xvec14, 2*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +JMP .L32_loopE; +ALIGN_5 +.L323_loopEx: +#### Writing Back #### +EXTRA_DY $1, yvec15, xvec14; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec13, xvec13; +LDH_DX 1*SIZE(C0), xvec13, xvec13; +LDL_DX 2*SIZE(C0), xvec12, xvec12; +LDH_DX 3*SIZE(C0), xvec12, xvec12; +ADD_DX xvec13, xvec15, xvec15; +ADD_DX xvec12, xvec14, xvec14; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec14, 2*SIZE(C0); +STH_DX xvec14, 3*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +.L32_loopE: +TEST $2, bm +JLE .L33_loopE; +ALIGN_5 +.L33_bodyB: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax +LEAQ (, %rax, SIZE), %rax +LEAQ (ptrba, %rax, 2), ptrba +ADDQ %rax, ptrbb; +#endif +#### Initial Result #### +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L331_loopE; +ALIGN_5 +.L331_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; +BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +BROAD_DX 1*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; + +LD_DX 4*SIZE(ptrba), xvec4; +BROAD_DX 2*SIZE(ptrbb), xvec5; +MUL_DX xvec4, xvec5, xvec5; +ADD_DX xvec5, xvec15, xvec15; + +LD_DX 6*SIZE(ptrba), xvec6; +BROAD_DX 3*SIZE(ptrbb), xvec7; +MUL_DX xvec6, xvec7, xvec7; +ADD_DX xvec7, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +DECQ k; +JG .L331_bodyB; +ALIGN_5 +.L331_loopE: +#ifndef TRMMKERNEL +TEST $2,bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax +#endif +JLE .L332_loopE; +ALIGN_5 +.L332_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; 
+BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec1; +BROAD_DX 1*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; +.L332_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L333_loopE; +ALIGN_5 +.L333_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; +BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $1*SIZE, ptrbb; +.L333_loopE: +#### Multiply Alpha #### +BROAD_DX MEMALPHA, xvec7; +MUL_DX xvec7, xvec15, xvec15; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +ADD_DX xvec14, xvec15, xvec15; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +addq $2, kk +#endif +ADDQ $2*SIZE, C0; +.L33_loopE: +TEST $1, bm +JLE .L34_loopE; +ALIGN_5 +.L34_bodyB: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L341_loopE; +ALIGN_5 +.L341_bodyB: +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 2*SIZE(ptrba), xvec0; +vmovsd 2*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 3*SIZE(ptrba), xvec0; +vmovsd 3*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; +addq $4*SIZE, ptrba; +addq $4*SIZE, ptrbb; +decq k; +JG .L341_bodyB; +ALIGN_5 +.L341_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +MOVQ kkk, %rax; +TEST $2, %rax; +#endif +JLE .L342_loopE; +ALIGN_5 +.L342_bodyB: +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; +addq $2*SIZE, ptrba; +addq $2*SIZE, ptrbb; + +.L342_loopE: +#ifndef TRMMKERNEL +TEST $1, bk +#else +MOVQ kkk, %rax; +TEST $1, %rax; +#endif +JLE .L343_loopE; +ALIGN_5 +.L343_bodyB: +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; +addq $1*SIZE, ptrba; +addq $1*SIZE, ptrbb; + +.L343_loopE: +#### Writing Back #### +vmovsd MEMALPHA, xvec7; +vmulsd xvec7, xvec15, xvec15; +#ifndef TRMMKERNEL +vmovsd 0*SIZE(C0), xvec0; +vaddsd xvec0, xvec15, xvec15; +#endif +movsd xvec15, 0*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; 
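+# TRMM tail fix-up: %rax = (bk - kkk)*SIZE, the stretch of A and B this tile never consumed; advance ptrba and ptrbb past it so the next tile starts at the correct offset.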
+ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +addq $1, kk +#endif +addq $1*SIZE, C0; +.L34_loopE: +MOVQ bk, k +SALQ $3, k; +ADDQ k, bb; +LEAQ (C, ldc, 1), C; + +.L30_loopE: +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; + +vzeroupper + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S index 2724cfe92..e4bde49bd 100644 --- a/kernel/x86_64/dgemm_ncopy_2.S +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S index 52115bd4d..1e4431664 100644 --- a/kernel/x86_64/dgemm_ncopy_4.S +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -45,7 +45,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S index 5d3627230..f35c3c5af 100644 --- a/kernel/x86_64/dgemm_ncopy_8.S +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S index 06e59991d..b0b3590aa 100644 --- a/kernel/x86_64/dgemm_tcopy_2.S +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -52,6 +52,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S index 8b81c41c0..85b0253d7 100644 --- a/kernel/x86_64/dgemm_tcopy_4.S +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -51,6 +51,12 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S index 976033714..3d411cda5 100644 --- a/kernel/x86_64/dgemm_tcopy_8.S +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -46,6 +46,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS diff --git a/kernel/x86_64/dgemv_t.S 
b/kernel/x86_64/dgemv_t.S index 071920723..48b3f17c4 100644 --- a/kernel/x86_64/dgemv_t.S +++ b/kernel/x86_64/dgemv_t.S @@ -47,7 +47,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,7 +57,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) - +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else #define STACKSIZE 256 @@ -71,6 +74,11 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +//Temp variables for M,N,A,LDA +#define MMM 224(%rsp) +#define NN 232(%rsp) +#define AA 240(%rsp) +#define LDAX 248(%rsp) #endif @@ -131,28 +139,18 @@ movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X + + movq M, MMM + movq N, NN + movq A, AA + movq LDA, LDAX + #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX #endif - - movq STACK_INCX, INCX - movq STACK_Y, Y - movq STACK_INCY, INCY - movq STACK_BUFFER, BUFFER - - leaq -1(INCX), %rax - - leaq (,LDA, SIZE), LDA - leaq (,INCX, SIZE), INCX - leaq (,INCY, SIZE), INCY - - leaq (LDA, LDA, 2), LDA3 - - subq $-16 * SIZE, A - #ifdef HAVE_SSE3 #ifndef WINDOWS_ABI movddup %xmm0, ALPHA @@ -168,6 +166,39 @@ unpcklpd ALPHA, ALPHA #endif + + +.L0x: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + + leaq -1(INCX), %rax + + leaq (,LDA, SIZE), LDA + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + testq M, M jle .L999 testq N, N @@ -854,7 +885,6 @@ .L21: #endif - subq $4, N leaq 16 * SIZE(BUFFER), X1 @@ -2461,6 +2491,12 @@ ALIGN_4 .L999: + leaq (, M, SIZE), %rax + addq %rax,AA + jmp .L0x; + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index 61c481064..985ce9fec 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -530,7 +530,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0x39, %xmm4, %xmm5 + pshufd $0x29, %xmm4, %xmm5 mulps %xmm8, %xmm5 addps %xmm5, %xmm3 @@ -750,7 +750,8 @@ xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 - mulps -32 * SIZE(Y), %xmm5 + movlps -32 * SIZE(Y), %xmm4 + mulps %xmm4, %xmm5 addps %xmm5, %xmm0 addq $2 * SIZE, X @@ -992,7 +993,7 @@ movsd -32 * SIZE(Y), %xmm8 movss %xmm5, %xmm4 - shufps $0x93, %xmm5, %xmm4 + shufps $0x93, %xmm4, %xmm4 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S index b40c8bac7..becd19544 100644 --- a/kernel/x86_64/gemm_kernel_8x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -930,7 +930,7 @@ .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 @@ -983,7 +983,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps 
%xmm10, %xmm9 @@ -1178,7 +1178,7 @@ .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -1423,7 +1423,7 @@ .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -1765,7 +1765,7 @@ .L62: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 @@ -1793,7 +1793,7 @@ addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -1822,7 +1822,7 @@ addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 @@ -1851,7 +1851,7 @@ addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 @@ -2024,7 +2024,7 @@ .L72: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2208,7 +2208,7 @@ .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -2395,7 +2395,7 @@ .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -2670,7 +2670,7 @@ .L112: mulps %xmm9, %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2687,7 +2687,7 @@ addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 @@ -2704,7 +2704,7 @@ addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 @@ -2721,7 +2721,7 @@ addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || 
defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 @@ -2857,7 +2857,7 @@ .L122: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 @@ -2873,7 +2873,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -3003,7 +3003,7 @@ .L132: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 @@ -3150,7 +3150,7 @@ .L142: mulss %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S index 72c2b9d20..06a0feae9 100644 --- a/kernel/x86_64/gemm_ncopy_2.S +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index a04542f6a..cac647fa0 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S index edde7e2c1..e5cbd62eb 100644 --- a/kernel/x86_64/gemm_ncopy_4_opteron.S +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (48 + 4) #define MOVNTQ MOVQ @@ -79,7 +79,7 @@ #define AO3 %r13 #define AO4 %rax -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S index 8bfaca265..190cebb29 100644 --- a/kernel/x86_64/gemm_tcopy_2.S +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index 877969ff5..c2308162f 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif 
-#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S index 459eeb8c5..105fe3b47 100644 --- a/kernel/x86_64/gemm_tcopy_4_opteron.S +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ @@ -96,7 +96,7 @@ #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S index 4e66e5338..404608256 100644 --- a/kernel/x86_64/izamax_sse2.S +++ b/kernel/x86_64/izamax_sse2.S @@ -469,7 +469,7 @@ ALIGN_4 .L71: -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) prefetch PREFETCHSIZE * SIZE(X) #endif diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index 323e8b9dd..9c8dd9dc2 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -266,7 +266,7 @@ sarq $5, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index b0abb4533..3823b1fc9 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -74,7 +74,8 @@ xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO - + jp .L100 # For Alpha = NaN + /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 @@ -250,7 +251,7 @@ sarq $4, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S new file mode 100644 index 000000000..20ddcaa8e --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -0,0 +1,3723 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + **********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define old_bm %rdi +#define old_bn %rsi +#define old_bk %rdx + +#define bm %r13 +#define bn %r14 +#define bk %r15 + +#define ALPHA %xmm0 +#define ba %rcx +#define bb %r8 +#define C %r9 +#define ldc %r10 + +#define i %r11 +#define k %rax + +#define ptrba %rdi +#define ptrbb %rsi +#define C0 %rbx +#define C1 %rbp + +#define prebb %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define old_ldc 8+STACKSIZE(%rsp) +#define old_offset 16+STACKSIZE(%rsp) +#define MEMALPHA 48(%rsp) +#define j 56(%rsp) +#define OFFSET 64(%rsp) +#define kk 72(%rsp) +#define kkk 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define old_ldc 64 + STACKSIZE(%rsp) +#define old_offset 72 + STACKSIZE(%rsp) + +#define MEMALPHA 224(%rsp) +#define j 232(%rsp) +#define OFFSET 240(%rsp) +#define kk 248(%rsp) +#define kkk 256(%rsp) + +#endif + +#define PREFETCH0 prefetcht0 +#define PREFETCH1 prefetcht0 +#define PREFETCH2 prefetcht2 +#define PRESIZE 80 + +#define xvec0 %xmm0 +#define xvec1 %xmm1 +#define xvec2 %xmm2 +#define xvec3 %xmm3 +#define xvec4 %xmm4 +#define xvec5 %xmm5 +#define xvec6 %xmm6 +#define xvec7 %xmm7 +#define xvec8 %xmm8 +#define xvec9 %xmm9 +#define xvec10 %xmm10 +#define xvec11 %xmm11 +#define xvec12 %xmm12 +#define xvec13 %xmm13 +#define xvec14 %xmm14 +#define xvec15 %xmm15 + +#define yvec0 %ymm0 +#define yvec1 %ymm1 +#define yvec2 %ymm2 +#define yvec3 %ymm3 +#define yvec4 %ymm4 +#define yvec5 %ymm5 +#define yvec6 %ymm6 +#define yvec7 %ymm7 +#define yvec8 %ymm8 +#define yvec9 %ymm9 +#define yvec10 %ymm10 +#define yvec11 %ymm11 +#define yvec12 %ymm12 +#define yvec13 %ymm13 +#define yvec14 %ymm14 +#define yvec15 %ymm15 + +#define LEAQ leaq +#define ADDQ addq +#define MULQ imulq +#define SARQ sarq +#define SALQ salq +#define ANDQ andq +#define SUBQ subq +#define DECQ decq +#define JG jg +#define JLE jle +#define TEST testq +#define OR orq +#define JNE jne +#define JMP jmp +#define NOP +#define XOR xorpd +#undef MOVQ +#define MOVQ movq + +#define XOR_SY vxorps +#define XOR_SX vxorps + +#define LD_SY vmovaps +#define LD_SX vmovaps +#define LDL_SX vmovlps +#define LDL_SY vmovlps +#define LDH_SX vmovhps +#define LDH_SY vmovhps + +#define ST_SY vmovaps +#define ST_SX vmovaps +#define STL_SX vmovlps +#define STL_SY vmovlps +#define STH_SX vmovhps +#define STH_SY vmovhps + +#define EDUP_SY vmovsldup +#define ODUP_SY vmovshdup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup + +#define ADD_SY vaddps +#define ADD_SX vaddps + +#define ADD1_DY vaddpd +#define ADDSUB_SY vaddsubps + +#define MUL_SY vmulps +#define MUL_SX vmulps + +#define SHUF_SY vperm2f128 +#define SHUF_DY vperm2f128 +#define SHUF_SX vpshufd + +#define VPERMILP_SY vpermilps +#define VPERMILP_SX vpermilps + +#define BROAD_SY vbroadcastss +#define BROAD_SX vbroadcastss + +#define MOV_SY vmovaps 
+#define MOV_SX vmovaps + +#define REVS_SY vshufps +#define REVS_SX vshufps + +#define EXTRA_SY vextractf128 + + +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, old_bm + movq ARG2, old_bn + movq ARG3, old_bk + movq OLD_A, ba + movq OLD_B, bb + movq OLD_C, C + movq old_ldc, ldc +#ifdef TRMMKERNEL + movq old_offset, %r11 +#endif + movaps %xmm3, %xmm0 +#else + +movq old_ldc, ldc +#ifdef TRMMKERNEL +movq old_offset, %r11 +#endif +#endif + +vzeroupper + +vmovlps ALPHA, MEMALPHA +movq old_bm, bm +movq old_bn, bn +movq old_bk, bk +leaq (, ldc, SIZE), ldc +#ifdef TRMMKERNEL +movq %r11, OFFSET +#ifndef LEFT +negq %r11; +#endif +movq %r11, kk +#endif +MOVQ bn,j; +SARQ $3,j; +JLE .L0_loopE; +ALIGN_4; +.L0_bodyB:; +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif + +MOVQ C,C0; +LEAQ (C,ldc,4),C1; +MOVQ bk, k; +SALQ $5, k; +LEAQ (bb, k, 1), prebb; +MOVQ ba,ptrba; +MOVQ bm,i; +SARQ $3,i; +JLE .L1_loopE; +ALIGN_4; +.L1_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#### Initial Results Register #### +XOR_SY yvec15, yvec15, yvec15; +PREFETCH0 0*SIZE(prebb); +XOR_SY yvec14, yvec14, yvec14; +PREFETCH0 16*SIZE(prebb); +XOR_SY yvec13, yvec13, yvec13; +PREFETCH0 32*SIZE(prebb); +XOR_SY yvec12, yvec12, yvec12; +ADDQ $48*SIZE, prebb; +EDUP_SY 0*SIZE(ptrbb), yvec2; +LEAQ (ldc, ldc, 2), %rax; +PREFETCH2 7*SIZE(C0); +PREFETCH2 7*SIZE(C1); +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +PREFETCH2 7*SIZE(C0, ldc, 1); +PREFETCH2 7*SIZE(C1, ldc, 1); +LD_SY 0*SIZE(ptrba), yvec0; +XOR_SY yvec9, yvec9, yvec9; +PREFETCH2 7*SIZE(C0, ldc, 2); +PREFETCH2 7*SIZE(C1, ldc, 2); +XOR_SY yvec8, yvec8, yvec8; +VPERMILP_SY $0x4e, yvec2, yvec3; +PREFETCH2 7*SIZE(C0, %rax, 1); +PREFETCH2 7*SIZE(C1, %rax, 1); +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $8, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2,k; +JLE .L2_loopE; +ALIGN_4; +.L2_bodyB:; +# Computing kernel + +#### Unroll times 1 #### +PREFETCH0 PRESIZE*SIZE(ptrba); +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 0*SIZE(ptrbb), yvec2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 8*SIZE(ptrba), yvec1; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +EDUP_SY 8*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, 
yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; + +#### Unroll times 2 #### +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 8*SIZE(ptrbb), yvec2 +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 16*SIZE(ptrba), yvec0; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +EDUP_SY 16*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; + +#### Unroll times 3 #### +PREFETCH0 (PRESIZE+16)*SIZE(ptrba); +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 16*SIZE(ptrbb), yvec2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 24*SIZE(ptrba), yvec1; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; +ADDQ $32*SIZE, ptrba; + +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +EDUP_SY 24*SIZE(ptrbb), yvec2; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; + +#### Unroll times 4 #### +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 24*SIZE(ptrbb), yvec2 +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADDQ $32*SIZE, ptrbb; +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 0*SIZE(ptrba), yvec0; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +EDUP_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; +.L2_bodyE:; +DECQ k; +JG .L2_bodyB; +ALIGN_4 +.L2_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L3_loopE; +ALIGN_4 +.L3_loopB: +#### Unroll times 1 #### +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 0*SIZE(ptrbb), yvec2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 8*SIZE(ptrba), yvec1; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADDQ $16*SIZE, ptrba; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +EDUP_SY 8*SIZE(ptrbb), yvec2; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5,
yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; + +#### Unroll times 2 #### +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 8*SIZE(ptrbb), yvec2 +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADDQ $16*SIZE, ptrbb +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +LD_SY 0*SIZE(ptrba), yvec0; +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec1, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +EDUP_SY 0*SIZE(ptrbb), yvec2; +MUL_SY yvec1, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec1, yvec4, yvec6; +MUL_SY yvec1, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; +.L3_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L4_loopE; +ALIGN_4 +.L4_loopB:; +#### Unroll times 1 #### +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +ODUP_SY 0*SIZE(ptrbb), yvec2 +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5 +ADDQ $8*SIZE, ptrba; +ADD_SY yvec15, yvec6, yvec15 +ADD_SY yvec13, yvec7, yvec13; + +VPERMILP_SY $0x4e, yvec2, yvec3; +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADDQ $8*SIZE, ptrbb; +ADD_SY yvec11, yvec6, yvec11; +ADD_SY yvec9, yvec7, yvec9; + +MUL_SY yvec0, yvec2, yvec6; +SHUF_SY $0x03, yvec2, yvec2, yvec4; +MUL_SY yvec0, yvec3, yvec7; +SHUF_SY $0x03, yvec3, yvec3, yvec5; +ADD_SY yvec14, yvec6, yvec14; +ADD_SY yvec12, yvec7, yvec12; + +MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec5, yvec7; +ADD_SY yvec10, yvec6, yvec10; +ADD_SY yvec8, yvec7, yvec8; + +.L4_loopE:; +#### Load Alpha #### +BROAD_SY MEMALPHA,yvec7; +MUL_SY yvec7,yvec15,yvec15; +MUL_SY yvec7,yvec14,yvec14; +MUL_SY yvec7,yvec13,yvec13; +MUL_SY yvec7,yvec12,yvec12; +MUL_SY yvec7,yvec11,yvec11; +MUL_SY yvec7,yvec10,yvec10; +MUL_SY yvec7,yvec9,yvec9; +MUL_SY yvec7,yvec8,yvec8; +MOV_SY yvec15,yvec7; +REVS_SY $0xe4,yvec13,yvec15,yvec15; +REVS_SY $0xe4,yvec7,yvec13,yvec13; +MOV_SY yvec14,yvec7; +REVS_SY $0xe4,yvec12,yvec14,yvec14; +REVS_SY $0xe4,yvec7,yvec12,yvec12; +MOV_SY yvec11,yvec7; +REVS_SY $0xe4,yvec9,yvec11,yvec11; +REVS_SY $0xe4,yvec7,yvec9,yvec9; +MOV_SY yvec10,yvec7; +REVS_SY $0xe4,yvec8,yvec10,yvec10; +REVS_SY $0xe4,yvec7,yvec8,yvec8; +##### Testing alignment ##### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L4_loopEx; +ALIGN_4 +LEAQ (ldc,ldc,2),%rax; +EXTRA_SY $1,yvec15,xvec7; +EXTRA_SY $1,yvec14,xvec6; +EXTRA_SY $1,yvec13,xvec5; +EXTRA_SY $1,yvec12,xvec4; +EXTRA_SY $1,yvec11,xvec3; +EXTRA_SY $1,yvec10,xvec2; +EXTRA_SY $1,yvec9,xvec1; +EXTRA_SY $1,yvec8,xvec0; +#ifndef TRMMKERNEL +ADD_SY 0*SIZE(C0), xvec15, xvec15; +ADD_SY 4*SIZE(C1), xvec7, xvec7; +ADD_SY 0*SIZE(C0,ldc,1), xvec14, xvec14; +ADD_SY 4*SIZE(C1,ldc,1), xvec6, xvec6; +ADD_SY 0*SIZE(C0,ldc,2), xvec13, xvec13; +ADD_SY 4*SIZE(C1,ldc,2), xvec5, xvec5; +ADD_SY 0*SIZE(C0,%rax,1), xvec12, xvec12; +ADD_SY 4*SIZE(C1,%rax,1), xvec4, xvec4; +ADD_SY 0*SIZE(C1), xvec11, xvec11; +ADD_SY 4*SIZE(C0), xvec3, xvec3; +ADD_SY 0*SIZE(C1,ldc,1), xvec10, xvec10; +ADD_SY 4*SIZE(C0,ldc,1), xvec2, xvec2; +ADD_SY 0*SIZE(C1,ldc,2), xvec9, xvec9; +ADD_SY 4*SIZE(C0,ldc,2), xvec1, xvec1; +ADD_SY 0*SIZE(C1,%rax,1), xvec8, xvec8; +ADD_SY 4*SIZE(C0,%rax,1), xvec0, xvec0; +#endif +ST_SY xvec15,0*SIZE(C0); +ST_SY xvec7,4*SIZE(C1); +ST_SY xvec14,0*SIZE(C0,ldc,1); +ST_SY 
xvec6,4*SIZE(C1,ldc,1); +ST_SY xvec13,0*SIZE(C0,ldc,2); +ST_SY xvec5,4*SIZE(C1,ldc,2); +ST_SY xvec12,0*SIZE(C0,%rax,1); +ST_SY xvec4,4*SIZE(C1,%rax,1); +ST_SY xvec11,0*SIZE(C1); +ST_SY xvec3,4*SIZE(C0); +ST_SY xvec10,0*SIZE(C1,ldc,1); +ST_SY xvec2,4*SIZE(C0,ldc,1); +ST_SY xvec9,0*SIZE(C1,ldc,2); +ST_SY xvec1,4*SIZE(C0,ldc,2); +ST_SY xvec8,0*SIZE(C1,%rax,1); +ST_SY xvec0,4*SIZE(C0,%rax,1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE,C0; +ADDQ $8*SIZE,C1; +.L1_bodyE:; +DECQ i; +JG .L1_bodyB; +JMP .L1_loopE; +ALIGN_4; +.L4_loopEx: +LEAQ (ldc,ldc,2),%rax; +EXTRA_SY $1, yvec15, xvec7; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0), xvec6, xvec6; +LDH_SY 2*SIZE(C0), xvec6, xvec6; +ADD_SY xvec6, xvec15, xvec15; +#endif +STL_SY xvec15, 0*SIZE(C0); +STH_SY xvec15, 2*SIZE(C0); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1), xvec5, xvec5; +LDH_SY 6*SIZE(C1), xvec5, xvec5; +ADD_SY xvec5, xvec7, xvec7; +#endif +STL_SY xvec7, 4*SIZE(C1); +STH_SY xvec7, 6*SIZE(C1); + +EXTRA_SY $1, yvec14, xvec6; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0, ldc, 1), xvec5, xvec5; +LDH_SY 2*SIZE(C0, ldc, 1), xvec5, xvec5; +ADD_SY xvec5, xvec14, xvec14; +#endif +STL_SY xvec14, 0*SIZE(C0, ldc, 1); +STH_SY xvec14, 2*SIZE(C0, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1, ldc, 1), xvec4, xvec4; +LDH_SY 6*SIZE(C1, ldc, 1), xvec4, xvec4; +ADD_SY xvec4, xvec6, xvec6; +#endif +STL_SY xvec6, 4*SIZE(C1, ldc, 1); +STH_SY xvec6, 6*SIZE(C1, ldc, 1); + +EXTRA_SY $1, yvec13, xvec5; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0, ldc, 2), xvec4, xvec4; +LDH_SY 2*SIZE(C0, ldc, 2), xvec4, xvec4; +ADD_SY xvec4, xvec13, xvec13; +#endif +STL_SY xvec13, 0*SIZE(C0, ldc, 2); +STH_SY xvec13, 2*SIZE(C0, ldc, 2); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1, ldc, 2), xvec3, xvec3; +LDH_SY 6*SIZE(C1, ldc, 2), xvec3, xvec3; +ADD_SY xvec3, xvec5, xvec5; +#endif +STL_SY xvec5, 4*SIZE(C1, ldc, 2); +STH_SY xvec5, 6*SIZE(C1, ldc, 2); + +EXTRA_SY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C0, %rax, 1), xvec3, xvec3; +LDH_SY 2*SIZE(C0, %rax, 1), xvec3, xvec3; +ADD_SY xvec3, xvec12, xvec12; +#endif +STL_SY xvec12, 0*SIZE(C0, %rax, 1); +STH_SY xvec12, 2*SIZE(C0, %rax, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C1, %rax, 1), xvec2, xvec2; +LDH_SY 6*SIZE(C1, %rax, 1), xvec2, xvec2; +ADD_SY xvec2, xvec4, xvec4; +#endif +STL_SY xvec4, 4*SIZE(C1, %rax, 1); +STH_SY xvec4, 6*SIZE(C1, %rax, 1); + +EXTRA_SY $1, yvec11, xvec3; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1), xvec2, xvec2; +LDH_SY 2*SIZE(C1), xvec2, xvec2; +ADD_SY xvec2, xvec11, xvec11; +#endif +STL_SY xvec11, 0*SIZE(C1); +STH_SY xvec11, 2*SIZE(C1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0), xvec1, xvec1; +LDH_SY 6*SIZE(C0), xvec1, xvec1; +ADD_SY xvec1, xvec3, xvec3; +#endif +STL_SY xvec3, 4*SIZE(C0); +STH_SY xvec3, 6*SIZE(C0); + +EXTRA_SY $1, yvec10, xvec2; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1, ldc, 1), xvec1, xvec1; +LDH_SY 2*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_SY xvec1, xvec10, xvec10; +#endif +STL_SY xvec10, 0*SIZE(C1, ldc, 1); +STH_SY xvec10, 2*SIZE(C1, ldc, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0, ldc, 1), xvec0, xvec0; +LDH_SY 6*SIZE(C0, ldc, 1), xvec0, xvec0; +ADD_SY xvec0, xvec2, xvec2; +#endif +STL_SY xvec2, 4*SIZE(C0, ldc, 1); +STH_SY xvec2, 6*SIZE(C0, ldc, 1); + +EXTRA_SY $1, yvec9, xvec1; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1, ldc, 2), xvec0, 
xvec0; +LDH_SY 2*SIZE(C1, ldc, 2), xvec0, xvec0; +ADD_SY xvec0, xvec9, xvec9; +#endif +STL_SY xvec9, 0*SIZE(C1, ldc, 2); +STH_SY xvec9, 2*SIZE(C1, ldc, 2); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0, ldc, 2), xvec7, xvec7; +LDH_SY 6*SIZE(C0, ldc, 2), xvec7, xvec7; +ADD_SY xvec7, xvec1, xvec1; +#endif +STL_SY xvec1, 4*SIZE(C0, ldc, 2); +STH_SY xvec1, 6*SIZE(C0, ldc, 2); + +EXTRA_SY $1, yvec8, xvec0; +#ifndef TRMMKERNEL +LDL_SY 0*SIZE(C1, %rax, 1), xvec6, xvec6; +LDH_SY 2*SIZE(C1, %rax, 1), xvec6, xvec6; +ADD_SY xvec6, xvec8, xvec8; +#endif +STL_SY xvec8, 0*SIZE(C1, %rax, 1); +STH_SY xvec8, 2*SIZE(C1, %rax, 1); +#ifndef TRMMKERNEL +LDL_SY 4*SIZE(C0, %rax, 1), xvec5, xvec5; +LDH_SY 6*SIZE(C0, %rax, 1), xvec5, xvec5; +ADD_SY xvec5, xvec0, xvec0; +#endif +STL_SY xvec0, 4*SIZE(C0, %rax, 1); +STH_SY xvec0, 6*SIZE(C0, %rax, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L1_bodyB; +ALIGN_4 +.L1_loopE:; +TEST $4, bm; +JLE .L5_loopE; +ALIGN_4 +.L5_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#### Initial Results Register #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +LD_SX 0*SIZE(ptrba), xvec0; +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +EDUP_SX 0*SIZE(ptrbb), xvec2; +XOR_SY yvec9, yvec9, yvec9; +XOR_SY yvec8, yvec8, yvec8; +ODUP_SX 0*SIZE(ptrbb), xvec3; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $8, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L8_loopE; +ALIGN_4 +.L8_bodyB: + +#### Unroll time 1 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec1; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 8*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +ODUP_SX 8*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; + +#### Unroll time 2 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 12*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 8*SIZE(ptrba), xvec0; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, 
xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 16*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +ODUP_SX 16*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; + +#### Unroll time 3 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 20*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 20*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 12*SIZE(ptrba), xvec1; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 24*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +ODUP_SX 24*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; +ADDQ $16*SIZE, ptrba; + +#### Unroll time 4 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 28*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 28*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $32*SIZE, ptrbb; + +LD_SX 0*SIZE(ptrba), xvec0; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +ODUP_SX 0*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; +DECQ k; +JG .L8_bodyB; +ALIGN_4 +.L8_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L9_loopE; +ALIGN_4 +.L9_bodyB: +#### Unroll time 1 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec1; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 8*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +ODUP_SX 8*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; + +#### Unroll time 2 #### +ADDQ $8*SIZE, ptrba; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 12*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $16*SIZE, ptrbb; + +LD_SX 0*SIZE(ptrba), xvec0; +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +EDUP_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, 
xvec9, xvec9; +ODUP_SX 0*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; + +.L9_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L10_loopE; +ALIGN_4 +.L10_bodyB: +#### Unroll time 1 #### +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; +ADDQ $4*SIZE, ptrba; + +EDUP_SX 4*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +ODUP_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $8*SIZE, ptrbb; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; + +.L10_loopE: +#### Multiply Alpha #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; +#### Reverse Result #### +MOV_SX xvec15, xvec7; +REVS_SX $0xe4, xvec13, xvec15, xvec15; +REVS_SX $0xe4, xvec7, xvec13, xvec13; +MOV_SX xvec14, xvec7; +REVS_SX $0xe4, xvec12, xvec14, xvec14; +REVS_SX $0xe4, xvec7, xvec12, xvec12; +MOV_SX xvec11, xvec7; +REVS_SX $0xe4, xvec9, xvec11, xvec11; +REVS_SX $0xe4, xvec7, xvec9, xvec9; +MOV_SX xvec10, xvec7; +REVS_SX $0xe4, xvec8, xvec10, xvec10; +REVS_SX $0xe4, xvec7, xvec8, xvec8; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L10_loopEx; +ALIGN_4 +LEAQ (ldc,ldc,2),%rax; +#ifndef TRMMKERNEL +ADD_SX 0*SIZE(C0), xvec15, xvec15; +ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14; +ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13; +ADD_SX 0*SIZE(C0, %rax,1), xvec12, xvec12; +ADD_SX 0*SIZE(C1), xvec11, xvec11; +ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10; +ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9; +ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; +#endif +ST_SX xvec15, 0*SIZE(C0); +ST_SX xvec14, 0*SIZE(C0, ldc, 1); +ST_SX xvec13, 0*SIZE(C0, ldc, 2); +ST_SX xvec12, 0*SIZE(C0, %rax, 1); +ST_SX xvec11, 0*SIZE(C1); +ST_SX xvec10, 0*SIZE(C1, ldc, 1); +ST_SX xvec9, 0*SIZE(C1, ldc, 2); +ST_SX xvec8, 0*SIZE(C1, %rax, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +JMP .L5_loopE; +ALIGN_4 +.L10_loopEx: +LEAQ (ldc,ldc,2),%rax; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec7, xvec7; +LDH_SX 2*SIZE(C0), xvec7, xvec7; +LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5; +LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5; +LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4; +LDL_SX 0*SIZE(C1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1; +LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0; +ADD_SX xvec7, xvec15, xvec15; +ADD_SX 
xvec6, xvec14, xvec14; +ADD_SX xvec5, xvec13, xvec13; +ADD_SX xvec4, xvec12, xvec12; +ADD_SX xvec3, xvec11, xvec11; +ADD_SX xvec2, xvec10, xvec10; +ADD_SX xvec1, xvec9, xvec9; +ADD_SX xvec0, xvec8, xvec8; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +STL_SX xvec14, 0*SIZE(C0, ldc, 1); +STH_SX xvec14, 2*SIZE(C0, ldc, 1); +STL_SX xvec13, 0*SIZE(C0, ldc, 2); +STH_SX xvec13, 2*SIZE(C0, ldc, 2); +STL_SX xvec12, 0*SIZE(C0, %rax, 1); +STH_SX xvec12, 2*SIZE(C0, %rax, 1); +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C1); +STL_SX xvec10, 0*SIZE(C1, ldc, 1); +STH_SX xvec10, 2*SIZE(C1, ldc, 1); +STL_SX xvec9, 0*SIZE(C1, ldc, 2); +STH_SX xvec9, 2*SIZE(C1, ldc, 2); +STL_SX xvec8, 0*SIZE(C1, %rax, 1); +STH_SX xvec8, 2*SIZE(C1, %rax, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L5_loopE: +TEST $2, bm; +JLE .L6_loopE; +ALIGN_4 +.L6_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb +#endif +#### Initial Results Register #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +MOVQ bk, k; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $8, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L11_loopE; +ALIGN_4 +.L11_bodyB: +#### Computing kernel +LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 +SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xee, xvec0, xvec6; +EDUP_SX 8*SIZE(ptrbb), xvec2; +ODUP_SX 8*SIZE(ptrbb), xvec3; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec4; +ODUP_SX 12*SIZE(ptrbb), xvec5; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec0; +SHUF_SX $0x44, xvec0, xvec1; +EDUP_SX 16*SIZE(ptrbb), xvec2; +ODUP_SX 16*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 20*SIZE(ptrbb), xvec4; +ODUP_SX 20*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xee, xvec0, xvec6; +EDUP_SX 24*SIZE(ptrbb), xvec2; +ODUP_SX 24*SIZE(ptrbb), xvec3; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 28*SIZE(ptrbb), xvec4; +ODUP_SX 28*SIZE(ptrbb), xvec5; 
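+# EDUP_SX/ODUP_SX (vmovsldup/vmovshdup) duplicate the even- and odd-indexed B entries in-register, saving a separate shuffle before each multiply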
+MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +ADDQ $8*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L11_bodyB; +ALIGN_4 +.L11_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L12_loopE; +ALIGN_4 +.L12_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 +SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +SHUF_SX $0xee, xvec0, xvec6; +EDUP_SX 8*SIZE(ptrbb), xvec2; +ODUP_SX 8*SIZE(ptrbb), xvec3; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 12*SIZE(ptrbb), xvec4; +ODUP_SX 12*SIZE(ptrbb), xvec5; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $4*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L12_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L13_loopE; +ALIGN_4 +.L13_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 +SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +EDUP_SX 4*SIZE(ptrbb), xvec4; +ODUP_SX 4*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $2*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +.L13_loopE: +LEAQ (ldc,ldc,2),%rax; +#### Multiply Alpha #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec11, xvec11; +LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11; +LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10; +LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10; +LDL_SX 0*SIZE(C1), xvec9, xvec9; +LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; +ADD_SX xvec11, xvec15, xvec15; +ADD_SX xvec10, xvec14, xvec14; +ADD_SX xvec9, xvec13, xvec13; +ADD_SX xvec8, xvec12, xvec12; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 0*SIZE(C0, ldc, 2); +STL_SX xvec14, 0*SIZE(C0, ldc, 1); +STH_SX xvec14, 0*SIZE(C0, %rax, 1); +STL_SX xvec13, 0*SIZE(C1); +STH_SX xvec13, 0*SIZE(C1, ldc, 2); +STL_SX xvec12, 0*SIZE(C1, ldc, 1); +STH_SX xvec12, 0*SIZE(C1, %rax, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +#### Writing Back #### +.L6_loopE: +TEST $1, bm; +JLE .L7_loopE; +ALIGN_4 +.L7_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#### Initial #### +XOR_SY
yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +MOVQ bk, k; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $8, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L14_loopE; +ALIGN_4 +.L14_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +LD_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +BROAD_SX 1*SIZE(ptrba), xvec1; +LD_SX 8*SIZE(ptrbb), xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +LD_SX 12*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +BROAD_SX 2*SIZE(ptrba), xvec0; +LD_SX 16*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +LD_SX 20*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +BROAD_SX 3*SIZE(ptrba), xvec1; +LD_SX 24*SIZE(ptrbb), xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +LD_SX 28*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +ADDQ $4*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L14_bodyB; +ALIGN_4 +.L14_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L15_loopE; +ALIGN_4 +.L15_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +LD_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +BROAD_SX 1*SIZE(ptrba), xvec1; +LD_SX 8*SIZE(ptrbb), xvec4; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +LD_SX 12*SIZE(ptrbb), xvec5; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; +ADDQ $2*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L15_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L16_loopE; +ALIGN_4 +.L16_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +LD_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; +ADDQ $1*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L16_loopE: +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; + +LEAQ (ldc,ldc,2),%rax; +SHUF_SX $0xff, xvec15, xvec13; +SHUF_SX $0xaa, xvec15, xvec12; +SHUF_SX $0x55, xvec15, xvec11; +SHUF_SX $0x00, xvec15, xvec10; + +#ifndef TRMMKERNEL +addss 0*SIZE(C0), xvec10; +addss 0*SIZE(C0, ldc, 1), xvec11; +addss 0*SIZE(C0, ldc, 2), xvec12; +addss 0*SIZE(C0, %rax, 1), xvec13; +#endif +movss xvec10, 0*SIZE(C0); +movss xvec11, 0*SIZE(C0, ldc, 1); +movss xvec12, 0*SIZE(C0, ldc, 2); +movss xvec13, 0*SIZE(C0, %rax, 1); + +SHUF_SX $0xff, xvec14, xvec9; +SHUF_SX $0xaa, xvec14, xvec8; +SHUF_SX $0x55, xvec14, xvec7; +SHUF_SX $0x00, xvec14, xvec6; + +#ifndef TRMMKERNEL +addss 0*SIZE(C1), xvec6; +addss 0*SIZE(C1, ldc, 1), xvec7; +addss 0*SIZE(C1, ldc, 2), xvec8; +addss 0*SIZE(C1, %rax, 1), xvec9; +#endif +movss xvec6, 0*SIZE(C1); +movss xvec7, 0*SIZE(C1, ldc, 1); +movss xvec8, 0*SIZE(C1, ldc, 2); +movss xvec9, 0*SIZE(C1, %rax, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 8), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $1, kk +#endif +ADDQ $1*SIZE, C0; +ADDQ
$1*SIZE, C1; +#### Writing Back #### +.L7_loopE: +#if defined(TRMMKERNEL)&&!defined(LEFT) +ADDQ $8, kk +#endif +MOVQ bk,k; +SALQ $5,k; +ADDQ k,bb; +LEAQ (C,ldc,8),C; +.L0_bodyE:; +DECQ j; +JG .L0_bodyB; +ALIGN_4; +.L0_loopE:; +TEST $4, bn; # Rn = 4 +JLE .L20_loopE; +ALIGN_4; +.L20_bodyB: +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif + +MOVQ C, C0; +LEAQ (C, ldc, 2), C1; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L21_loopE; +ALIGN_4 +.L21_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +EDUP_SX 0*SIZE(ptrbb), xvec2; +XOR_SY yvec11, yvec11, yvec11; +XOR_SY yvec10, yvec10, yvec10; +LD_SX 0*SIZE(ptrba), xvec0; +XOR_SY yvec9, yvec9, yvec9; +XOR_SY yvec8, yvec8, yvec8; +LD_SX 4*SIZE(ptrba), xvec1; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2,k; +JLE .L211_loopE; +ALIGN_4 +.L211_bodyB: +#### Unroll time 1 #### +ODUP_SX 0*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 4*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 8*SIZE(ptrba), xvec0; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 12*SIZE(ptrba), xvec1; + +#### Unroll time 2 #### +ODUP_SX 4*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 8*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 16*SIZE(ptrba), xvec0; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 20*SIZE(ptrba), xvec1; + +#### Unroll time 3 #### +ODUP_SX 8*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 12*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +ADDQ $16*SIZE, ptrbb; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, 
xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 24*SIZE(ptrba), xvec0; + +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 28*SIZE(ptrba), xvec1; +ADDQ $32*SIZE, ptrba; + +#### Unroll time 4 #### +ODUP_SX -4*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 0*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 0*SIZE(ptrba), xvec0; + +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 4*SIZE(ptrba), xvec1; +DECQ k; +JG .L211_bodyB; +ALIGN_4 +.L211_loopE: +#ifndef TRMMKERNEL +TEST $2, bk +#else +TEST $2, kkk; +#endif +JLE .L212_loopE; +ALIGN_4 +.L212_bodyB: +#### Unroll time 1 #### +ODUP_SX 0*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 4*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +ADDQ $8*SIZE, ptrbb; + +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 8*SIZE(ptrba), xvec0; + +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 12*SIZE(ptrba), xvec1; +ADDQ $16*SIZE, ptrba; + +#### Unroll time 2 #### +ODUP_SX -4*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +SHUF_SX $0x4e, xvec3, xvec5; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; + +EDUP_SX 0*SIZE(ptrbb), xvec2; +MOV_SX xvec4, xvec6; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +LD_SX 0*SIZE(ptrba), xvec0; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; +LD_SX 4*SIZE(ptrba), xvec1; + +.L212_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L213_loopE; +ALIGN_4 +.L213_bodyB: +ODUP_SX 0*SIZE(ptrbb), xvec3; +SHUF_SX $0x4e, xvec2, xvec4; +MOV_SX xvec2, xvec6; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +ADDQ $4*SIZE, ptrbb; + +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; +MOV_SX xvec3, xvec7; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; + +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; +MOV_SX xvec4, xvec6; +ADDQ $8*SIZE, ptrba; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; + +MOV_SX xvec5, xvec7; +MUL_SX 
xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; + +.L213_loopE: +#### Multiply Alpha #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; +#### Writing Back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7; +LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 4*SIZE(C0); +STH_SX xvec14, 6*SIZE(C1); +STL_SX xvec13, 0*SIZE(C0, ldc, 1); +STH_SX xvec13, 2*SIZE(C1, ldc, 1); +STL_SX xvec12, 4*SIZE(C0, ldc, 1); +STH_SX xvec12, 6*SIZE(C1, ldc, 1); +STL_SX xvec11, 0*SIZE(C1); +STH_SX xvec11, 2*SIZE(C0); +STL_SX xvec10, 4*SIZE(C1); +STH_SX xvec10, 6*SIZE(C0); +STL_SX xvec9, 0*SIZE(C1, ldc, 1); +STH_SX xvec9, 2*SIZE(C0, ldc, 1); +STL_SX xvec8, 4*SIZE(C1, ldc, 1); +STH_SX xvec8, 6*SIZE(C0, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L21_bodyB; +ALIGN_4 +.L21_loopE: +TEST $4, bm; +JLE .L22_loopE; +ALIGN_4 +.L22_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial Results #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L221_loopE; +ALIGN_4 +.L221_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec1; +EDUP_SX 4*SIZE(ptrbb), xvec2; +ODUP_SX 4*SIZE(ptrbb), xvec3; + 
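+# unroll step 2: same even/odd-duplicate and shuffle pattern applied to the next panel of A and B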
+SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 8*SIZE(ptrba), xvec0; +EDUP_SX 8*SIZE(ptrbb), xvec2; +ODUP_SX 8*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 12*SIZE(ptrba), xvec1; +EDUP_SX 12*SIZE(ptrbb), xvec2; +ODUP_SX 12*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15 +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $16*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +DECQ k; +JG .L221_bodyB; +ALIGN_4 +.L221_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L222_loopE; +ALIGN_4 +.L222_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; + +LD_SX 4*SIZE(ptrba), xvec1; +EDUP_SX 4*SIZE(ptrbb), xvec2; +ODUP_SX 4*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13 +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +.L222_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L223_loopE; +ALIGN_4 +.L223_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec2; +ODUP_SX 0*SIZE(ptrbb), xvec3; + +SHUF_SX $0x4e, xvec2, xvec4; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +SHUF_SX $0x4e, xvec3, xvec5; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +.L223_loopE: +#### Multiply Alpha #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 0*SIZE(C0, ldc, 1); +STH_SX xvec14, 2*SIZE(C1, ldc, 1); +STL_SX xvec13, 0*SIZE(C1); +STH_SX xvec13, 2*SIZE(C0); 
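+# (the 0x4e shuffles leave each register's low and high halves in different column blocks, hence the crossed C0/C1 stores)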
+STL_SX xvec12, 0*SIZE(C1, ldc, 1); +STH_SX xvec12, 2*SIZE(C0, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L22_loopE: +TEST $2, bm; +JLE .L23_loopE; +ALIGN_4 +.L23_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L231_loopE; +ALIGN_4 +.L231_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x44, xvec0, xvec1; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +SHUF_SX $0xee, xvec0, xvec2; +EDUP_SX 4*SIZE(ptrbb), xvec6; +ODUP_SX 4*SIZE(ptrbb), xvec7; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; + +LD_SX 4*SIZE(ptrba), xvec0; +EDUP_SX 8*SIZE(ptrbb), xvec4; +ODUP_SX 8*SIZE(ptrbb), xvec5; +SHUF_SX $0x44, xvec0, xvec1; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +SHUF_SX $0xee, xvec0, xvec2; +EDUP_SX 12*SIZE(ptrbb), xvec6; +ODUP_SX 12*SIZE(ptrbb), xvec7; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; + +ADDQ $8*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L231_bodyB; +ALIGN_4 +.L231_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L232_loopE; +ALIGN_4 +.L232_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x44, xvec0, xvec1; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +SHUF_SX $0xee, xvec0, xvec2; +EDUP_SX 4*SIZE(ptrbb), xvec6; +ODUP_SX 4*SIZE(ptrbb), xvec7; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; + +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +.L232_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L233_loopE; +ALIGN_4 +.L233_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +EDUP_SX 0*SIZE(ptrbb), xvec4; +ODUP_SX 0*SIZE(ptrbb), xvec5; +SHUF_SX $0x44, xvec0, xvec1; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; + +ADDQ $2*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +.L233_loopE: +#### Multiply Alpha #### +BROAD_SY MEMALPHA, yvec7; +MUL_SY xvec7, xvec15, xvec15; +MUL_SY xvec7, xvec14, xvec14; +#### Writing Back #### +SHUF_SX $0xee, xvec15, xvec13; +SHUF_SX $0xee, xvec14, xvec12; +#ifndef TRMMKERNEL +ADD_SY 0*SIZE(C0), xvec15, xvec15; +ADD_SY 0*SIZE(C0, ldc, 1), xvec14, xvec14; +ADD_SY 0*SIZE(C1), xvec13, xvec13; 
+ADD_SY 0*SIZE(C1, ldc, 1), xvec12, xvec12; +#endif +STL_SY xvec15, 0*SIZE(C0); +STL_SY xvec14, 0*SIZE(C0, ldc, 1); +STL_SY xvec13, 0*SIZE(C1); +STL_SY xvec12, 0*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L23_loopE: +TEST $1, bm; +JLE .L24_loopE; +ALIGN_4 +.L24_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L241_loopE; +ALIGN_4 +.L241_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +BROAD_SX 1*SIZE(ptrba), xvec2; +LD_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; + +BROAD_SX 2*SIZE(ptrba), xvec4; +LD_SX 8*SIZE(ptrbb), xvec5; +MUL_SX xvec4, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; + +BROAD_SX 3*SIZE(ptrba), xvec6; +LD_SX 12*SIZE(ptrbb), xvec7; +MUL_SX xvec6, xvec7, xvec7; +ADD_SX xvec7, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L241_bodyB; +ALIGN_4 +.L241_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L242_loopE; +ALIGN_4 +.L242_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +BROAD_SX 1*SIZE(ptrba), xvec2; +LD_SX 4*SIZE(ptrbb), xvec3; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L242_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L243_loopE; +ALIGN_4; +.L243_bodyB: +BROAD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; +ADDQ $1*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +.L243_loopE: +#### Multiply Alpha #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +SHUF_SX $0xff, xvec15, xvec14; +SHUF_SX $0xaa, xvec15, xvec13; +SHUF_SX $0x55, xvec15, xvec12; +SHUF_SX $0x00, xvec15, xvec11; + +#ifndef TRMMKERNEL +addss 0*SIZE(C0), xvec11; +addss 0*SIZE(C0, ldc, 1), xvec12; +addss 0*SIZE(C1), xvec13; +addss 0*SIZE(C1, ldc, 1), xvec14; +#endif + +movss xvec11, 0*SIZE(C0); +movss xvec12, 0*SIZE(C0, ldc, 1); +movss xvec13, 0*SIZE(C1); +movss xvec14, 0*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $1, kk +#endif +ADDQ $1*SIZE, C0; +ADDQ $1*SIZE, C1; +.L24_loopE: +#if defined(TRMMKERNEL)&&!defined(LEFT) +ADDQ $4, kk +#endif +MOVQ bk, k; +SALQ $4, k; +ADDQ k, bb; +LEAQ (C, ldc, 4), C; +.L20_loopE: +TEST $2, bn; +JLE .L30_loopE; +ALIGN_4 +.L30_bodyB: +#if 
defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk +#endif +MOVQ C, C0; +LEAQ (C, ldc, 1), C1; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L31_loopE; +ALIGN_4 +.L31_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L311_loopE; +ALIGN_4 +.L311_bodyB: +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; + +SHUF_SX $0xfa, xvec2, xvec3; +LD_SX 8*SIZE(ptrba), xvec0; +LD_SX 12*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; + +LD_SX 4*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +LD_SX 16*SIZE(ptrba), xvec0; +LD_SX 20*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; + +SHUF_SX $0xfa, xvec2, xvec3; +LD_SX 24*SIZE(ptrba), xvec0; +LD_SX 28*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; + +ADDQ $32*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L311_bodyB; +ALIGN_4 +.L311_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L312_loopE; +ALIGN_4 +.L312_bodyB: +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; + +SHUF_SX $0xfa, xvec2, xvec3; +LD_SX 8*SIZE(ptrba), xvec0; +LD_SX 12*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; 
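[annotation] The TRMM preamble at the top of this strip (the MOVQ bk,%rax / SUBQ kk vs. MOVQ kk,%rax / ADDQ $8-or-$2 chain above) picks the effective inner-product length kkk for the current tile. A minimal C sketch of that bookkeeping, with all names illustrative rather than taken from the source:

    #include <stdio.h>

    /* left/transa stand in for the LEFT/TRANSA build flags; tile is the
       constant added in the #else branch ($8 or $2 in this strip). */
    static long effective_k(long bk, long kk, int left, int transa, long tile)
    {
        if ((left && !transa) || (!left && transa))
            return bk - kk;   /* MOVQ bk,%rax; SUBQ kk,%rax */
        return kk + tile;     /* MOVQ kk,%rax; ADDQ $8 (or $2),%rax */
    }

    int main(void)
    {
        /* e.g. K=64, kk=16, LEFT && !TRANSA -> 48 iterations remain */
        printf("%ld\n", effective_k(64, 16, 1, 0, 8));
        return 0;
    }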
+ +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L312_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L313_loopE; +ALIGN_4 +.L313_bodyB: +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrba), xvec1; + +MOV_SX xvec3, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +SHUF_SX $0x4e, xvec4, xvec5; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +MOV_SX xvec5, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; +ADDQ $8*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L313_loopE: +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +#### Writing Back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 4*SIZE(C1), xvec3, xvec3; +LDH_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 4*SIZE(C0); +STH_SX xvec14, 6*SIZE(C1); +STL_SX xvec13, 0*SIZE(C1); +STH_SX xvec13, 2*SIZE(C0); +STL_SX xvec12, 4*SIZE(C1); +STH_SX xvec12, 6*SIZE(C0); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $8, kk +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L31_bodyB; +ALIGN_4 +.L31_loopE: +TEST $4, bm; +JLE .L32_loopE; +ALIGN_4 +.L32_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L321_loopE; +ALIGN_4 +.L321_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +SHUF_SX $0x05, xvec2, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +LD_SX 4*SIZE(ptrba), xvec0; +SHUF_SX $0xfa, xvec2, xvec5; +SHUF_SX $0xaf, xvec2, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +LD_SX 8*SIZE(ptrba), xvec0; +LD_SX 4*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +SHUF_SX $0x05, xvec2, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +LD_SX 12*SIZE(ptrba), xvec0; +SHUF_SX $0xfa, xvec2, xvec5; +SHUF_SX $0xaf, xvec2, xvec6; 
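[annotation] The write-back just above (after .L313_loopE) is the pattern repeated at the end of every tile in this kernel: scale the accumulators by alpha, and for plain GEMM add the existing C entries loaded by the LDL_SX/LDH_SX pairs; under TRMMKERNEL the load/add is compiled out and alpha*acc is stored directly. A C sketch of the same dataflow, with illustrative names:

    static void write_back(float *C, long ldc, long mr, long nr,
                           float alpha, const float *acc, int trmm)
    {
        for (long j = 0; j < nr; j++)
            for (long i = 0; i < mr; i++) {
                float v = alpha * acc[j * mr + i];
                /* trmm: store alpha*acc; else accumulate into C */
                C[j * ldc + i] = trmm ? v : C[j * ldc + i] + v;
            }
    }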
+MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; + +ADDQ $16*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L321_bodyB; +ALIGN_4 +.L321_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L322_loopE; +ALIGN_4 +.L322_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +SHUF_SX $0x05, xvec2, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; + +LD_SX 4*SIZE(ptrba), xvec0; +SHUF_SX $0xfa, xvec2, xvec5; +SHUF_SX $0xaf, xvec2, xvec6; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; +ADDQ $8*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L322_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L323_loopE; +ALIGN_4 +.L323_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +LD_SX 0*SIZE(ptrbb), xvec2; +SHUF_SX $0x50, xvec2, xvec3; +SHUF_SX $0x05, xvec2, xvec4; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; +ADDQ $4*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L323_loopE: +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C1); +STL_SX xvec14, 0*SIZE(C1); +STH_SX xvec14, 2*SIZE(C0); +#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L32_loopE: +TEST $2, bm; +JLE .L33_loopE; +ALIGN_4 +.L33_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +XOR_SY yvec13, yvec13, yvec13; +XOR_SY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L331_loopE; +ALIGN_4 +.L331_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 +EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 +ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 + +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +LD_SX 4*SIZE(ptrba), xvec0; +EDUP_SX 4*SIZE(ptrbb), xvec2; +ODUP_SX 4*SIZE(ptrbb), xvec3; + +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L331_bodyB; +ALIGN_4 +.L331_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L332_loopE; +ALIGN_4 +.L332_bodyB: +LD_SX 
0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 +EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 +ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 + +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; + +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L332_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L333_loopE; +ALIGN_4 +.L333_bodyB: +movss 0*SIZE(ptrba), xvec0; +movss 1*SIZE(ptrba), xvec1; +movss 0*SIZE(ptrbb), xvec2; +XOR_SY yvec3, yvec3, yvec3; +movss xvec2, xvec3; +mulss xvec0, xvec2; +addss xvec2, xvec15; +mulss xvec1, xvec3; +SHUF_SX $0xe1, xvec3, xvec4; +ADD_SX xvec4, xvec15, xvec15; + +movss 1*SIZE(ptrbb), xvec5; +XOR_SY yvec6, yvec6, yvec6; +movss xvec5, xvec6; +mulss xvec0, xvec5; +addss xvec5, xvec14; +mulss xvec1, xvec6; +SHUF_SX $0xe1, xvec6, xvec7; +ADD_SX xvec7, xvec14, xvec14 + +ADDQ $2*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; +.L333_loopE: +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +SHUF_SX $0xee, xvec15, xvec13; +SHUF_SX $0xee, xvec14, xvec12; +SHUF_SX $0x44, xvec15, xvec11; +SHUF_SX $0x44, xvec14, xvec10; +ADD_SX xvec13, xvec11, xvec11; +ADD_SX xvec12, xvec10, xvec10; + +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec11, xvec11; +ADD_SX xvec1, xvec10, xvec10; +#endif +STL_SX xvec11, 0*SIZE(C0); +STL_SX xvec10, 0*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +#### Writing Back #### +.L33_loopE: +TEST $1, bm; +JLE .L34_loopE; +ALIGN_4 +.L34_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#### Initial #### +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L341_loopE; +ALIGN_4 +.L341_bodyB: +movss 0*SIZE(ptrba), xvec0; +movss 0*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 1*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; + +movss 1*SIZE(ptrba), xvec0; +movss 2*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 3*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; + +movss 2*SIZE(ptrba), xvec0; +movss 4*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 5*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; + +movss 3*SIZE(ptrba), xvec0; +movss 6*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 7*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; + +addq $4*SIZE, ptrba; +addq $8*SIZE, ptrbb; +decq k; +jg .L341_bodyB; +ALIGN_4 +.L341_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L342_loopE; +ALIGN_4 +.L342_bodyB: +movss 0*SIZE(ptrba), xvec0; +movss 0*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; 
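[annotation] The .L34x blocks handle the m-remainder of one packed A row against two interleaved B columns entirely in scalar SSE, unrolled by four with k%2 and k%1 tails. A C sketch of the same dataflow as the movss/mulss/addss chains (A, B, K illustrative):

    static void tail_1x2(const float *A, const float *B, long K,
                         float *c0, float *c1)
    {
        float s0 = 0.0f, s1 = 0.0f;
        for (long l = 0; l < K; l++) {
            s0 += A[l] * B[2 * l + 0];   /* column 0 partial sum */
            s1 += A[l] * B[2 * l + 1];   /* column 1 partial sum */
        }
        *c0 = s0;
        *c1 = s1;
    }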
+addss xvec1, xvec15; + +movss 1*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; + +movss 1*SIZE(ptrba), xvec0; +movss 2*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 3*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; +addq $2*SIZE, ptrba; +addq $4*SIZE, ptrbb; +.L342_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L343_loopE; +ALIGN_4 +.L343_bodyB: +movss 0*SIZE(ptrba), xvec0; +movss 0*SIZE(ptrbb), xvec1; +mulss xvec0, xvec1; +addss xvec1, xvec15; + +movss 1*SIZE(ptrbb), xvec2; +mulss xvec0, xvec2; +addss xvec2, xvec14; +addq $1*SIZE, ptrba; +addq $2*SIZE, ptrbb + +.L343_loopE: +#### Writing back #### +movss MEMALPHA, xvec7; +mulss xvec7, xvec15; +mulss xvec7, xvec14; +movss 0*SIZE(C0), xvec0; +movss 0*SIZE(C1), xvec1; +#ifndef TRMMKERNEL +addss xvec0, xvec15; +addss xvec1, xvec14; +#endif +movss xvec15, 0*SIZE(C0); +movss xvec14, 0*SIZE(C1); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +addq $1*SIZE, C0; +addq $1*SIZE, C1; +.L34_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $2, kk; +#endif +MOVQ bk, k; +SALQ $3, k; +ADDQ k, bb; +LEAQ (C, ldc, 2), C; +.L30_loopE: +TEST $1, bn; +JLE .L40_loopE; +ALIGN_4 +.L40_bodyB: +#if defined(TRMMKERNEL)&&defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C, C0; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $3, i; +JLE .L41_loopE; +ALIGN_4 +.L41_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#### initial #### +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $8, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L411_loopE; +ALIGN_4 +.L411_bodyB: +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +LD_SY 8*SIZE(ptrba), yvec0; +BROAD_SY 1*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +LD_SY 16*SIZE(ptrba), yvec0; +BROAD_SY 2*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +LD_SY 24*SIZE(ptrba), yvec0; +BROAD_SY 3*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +ADDQ $32*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +DECQ k; +JG .L411_bodyB; +ALIGN_4 +.L411_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L412_loopE; +ALIGN_4 +.L412_bodyB: +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +LD_SY 8*SIZE(ptrba), yvec0; +BROAD_SY 1*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; + +ADDQ $16*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; +.L412_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L413_loopE; +ALIGN_4 +.L413_bodyB: +LD_SY 0*SIZE(ptrba), yvec0; +BROAD_SY 0*SIZE(ptrbb), yvec1; +MUL_SY yvec0, yvec1, yvec2; +ADD_SY yvec2, yvec15, yvec15; +ADDQ $8*SIZE, 
ptrba; +ADDQ $1*SIZE, ptrbb; +.L413_loopE: +#### Writing #### +BROAD_SY MEMALPHA, yvec7; +MUL_SY yvec7, yvec15, yvec15; +EXTRA_SY $1, yvec15, xvec14; +SHUF_SX $0x44, xvec15, xvec13; +SHUF_SX $0xee, xvec15, xvec12; +SHUF_SX $0x44, xvec14, xvec11; +SHUF_SX $0xee, xvec14, xvec10; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 4*SIZE(C0), xvec2, xvec2; +LDL_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec13, xvec13; +ADD_SX xvec1, xvec12, xvec12; +ADD_SX xvec2, xvec11, xvec11; +ADD_SX xvec3, xvec10, xvec10; +#endif +STL_SX xvec13, 0*SIZE(C0); +STL_SX xvec12, 2*SIZE(C0); +STL_SX xvec11, 4*SIZE(C0); +STL_SX xvec10, 6*SIZE(C0); +#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 8), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL)&&defined(LEFT) +ADDQ $8, kk; +#endif +ADDQ $8*SIZE, C0; +DECQ i; +JG .L41_bodyB; +ALIGN_4 +.L41_loopE: +TEST $4, bm; +JLE .L42_loopE; +ALIGN_4 +.L42_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk +#endif +SARQ $2, k; +JLE .L421_loopE; +ALIGN_4 +.L421_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +LD_SX 4*SIZE(ptrba), xvec0; +BROAD_SX 1*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +LD_SX 8*SIZE(ptrba), xvec0; +BROAD_SX 2*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +LD_SX 12*SIZE(ptrba), xvec0; +BROAD_SX 3*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; +DECQ k; +JG .L421_bodyB; +ALIGN_4 +.L421_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L422_loopE; +ALIGN_4 +.L422_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +LD_SX 4*SIZE(ptrba), xvec0; +BROAD_SX 1*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; + +ADDQ $8*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; +.L422_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L423_loopE; +ALIGN_4 +.L423_bodyB: +LD_SX 0*SIZE(ptrba), xvec0; +BROAD_SX 0*SIZE(ptrbb), xvec1; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $1*SIZE, ptrbb; + +.L423_loopE: +#### Writing back #### +BROAD_SX MEMALPHA, xvec7; +MUL_SX xvec7, xvec15, xvec15; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; +#endif +STL_SX xvec15, 0*SIZE(C0); +STH_SX xvec15, 2*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (, %rax, SIZE), %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && 
defined(LEFT) +ADDQ $4, kk +#endif +ADDQ $4*SIZE, C0; + +.L42_loopE: +TEST $2, bm; +JLE .L43_loopE; +ALIGN_4 +.L43_bodyB: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax +LEAQ (, %rax, SIZE), %rax +LEAQ (ptrba, %rax, 2), ptrba +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +XOR_SY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L431_loopE; +ALIGN_4 +.L431_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; + +vmovss 4*SIZE(ptrba), xvec0; +vmovss 5*SIZE(ptrba), xvec1; +vmovss 2*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 6*SIZE(ptrba), xvec3; +vmovss 7*SIZE(ptrba), xvec4; +vmovss 3*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; +addq $8*SIZE, ptrba; +addq $4*SIZE, ptrbb; +decq k; +JG .L431_bodyB; +ALIGN_4 +.L431_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L432_loopE; +ALIGN_4 +.L432_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; +addq $4*SIZE, ptrba; +addq $2*SIZE, ptrbb; + +.L432_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L433_loopE; +ALIGN_4 +.L433_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; +addq $2*SIZE, ptrba; +addq $1*SIZE, ptrbb; + +.L433_loopE: +#### Writing Back #### +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; +vmulss xvec7, xvec14, xvec14; + +#ifndef TRMMKERNEL +vaddss 0*SIZE(C0), xvec15, xvec15; +vaddss 1*SIZE(C0), xvec14, xvec14; +#endif +vmovss xvec15, 0*SIZE(C0); +vmovss xvec14, 1*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +addq $2, kk +#endif +addq $2*SIZE, C0; + +.L43_loopE: +TEST $1, bm; +JLE .L44_loopE; +ALIGN_4 +.L44_bodyB: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bb, ptrbb; +#else +MOVQ bb, ptrbb; 
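[annotation] The #else branch continuing below computes the TRMM start offset for this 1x1 tile: both packed panels skip kk elements so the tile begins at the triangle boundary. The wider strips earlier in the file do the same with LEAQ scale factors of 8, 4, or 2. A one-line C sketch under those assumptions (names illustrative):

    static void trmm_offset(const float **ptrba, const float **ptrbb,
                            long kk, long mr, long nr)
    {
        *ptrba += kk * mr;   /* mr = nr = 1 in this tail */
        *ptrbb += kk * nr;
    }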
+MOVQ kk, %rax; +LEAQ (, %rax, SIZE), %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_SY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk, k; +#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L441_loopE; +ALIGN_4 +.L441_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 2*SIZE(ptrba), xvec0; +vmovss 2*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 3*SIZE(ptrba), xvec0; +vmovss 3*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; +addq $4*SIZE, ptrba; +addq $4*SIZE, ptrbb; +decq k; +JG .L441_bodyB; +ALIGN_4 +.L441_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L442_loopE; +ALIGN_4 +.L442_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; +addq $2*SIZE, ptrba; +addq $2*SIZE, ptrbb; + +.L442_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L443_loopE; +ALIGN_4 +.L443_bodyB: +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; +addq $1*SIZE, ptrba; +addq $1*SIZE, ptrbb; + +.L443_loopE: +#### Writing Back #### +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; +#ifndef TRMMKERNEL +vaddss 0*SIZE(C0), xvec15, xvec15; +#endif +vmovss xvec15, 0*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +LEAQ (,%rax, SIZE), %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +addq $1, kk +#endif +addq $1*SIZE, C0; + +.L44_loopE: +MOV bk, k; +SALQ $2, k; +ADDQ k, bb; +ADDQ ldc, C; + +.L40_loopE: +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; + +vzeroupper + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 052ff1a79..854e0f295 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,4 +1,3 @@ -/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ @@ -47,7 +46,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,6 +56,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else @@ -71,6 +74,10 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define MMM 216(%rsp) +#define NN 224(%rsp) +#define AA 232(%rsp) +#define LDAX 240(%rsp) #endif @@ -127,29 +134,48 @@ movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, X + movq X, AA + movq OLD_LDA, X + movq X, LDAX movq OLD_X, X #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX #endif - - movq STACK_INCX, INCX - movq STACK_Y, Y - movq STACK_INCY, INCY - movq STACK_BUFFER, BUFFER - #ifndef WINDOWS_ABI pshufd $0, %xmm0, ALPHA #else pshufd $0, %xmm3, ALPHA #endif + +.L0t: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00t + ALIGN_4 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00t: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA @@ -6341,6 +6367,12 @@ ALIGN_4 .L999: + leaq (,M,SIZE),%rax + addq %rax,AA + jmp .L0t + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 901a5ad31..0f1ebd564 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bfe7ebd69..9dd123c52 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 2df76f1cb..93a66aaa7 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH 
prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index bbba0b427..f412b3e2f 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S index 513572ee9..552dbacdc 100644 --- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S index 526a78c57..7727fd591 100644 --- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S index e96496fd6..699364941 100644 --- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index 13804e0f8..e2f153ab3 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -699,7 +699,7 @@ movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1336,7 +1336,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1697,7 +1697,7 @@ movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2024,7 +2024,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S new file mode 100644 index 000000000..9f6fb8a5f --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -0,0 +1,3239 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + **********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define old_bm %rdi +#define old_bn %rsi +#define old_bk %rdx + +#define bm %r13 +#define bn %r14 +#define bk %r15 + +#define ALPHA %xmm0 +#define ba %rcx +#define bb %r8 +#define C %r9 +#define ldc %r10 + +#define i %r11 +#define k %rax + +#define ptrba %rdi +#define ptrbb %rsi +#define C0 %rbx +#define C1 %rbp + +#define prebb %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define old_ldc 8+STACKSIZE(%rsp) +#define old_offset 16+STACKSIZE(%rsp) + +#define MEMALPHA_R 48(%rsp) +#define MEMALPHA_I 56(%rsp) +#define j 64(%rsp) +#define OFFSET 72(%rsp) +#define kk 80(%rsp) +#define kkk 88(%rsp) + +#else +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define old_ldc 72 + STACKSIZE(%rsp) +#define old_offset 80 + STACKSIZE(%rsp) + +#define MEMALPHA_R 224(%rsp) +#define MEMALPHA_I 232(%rsp) +#define j 240(%rsp) +#define OFFSET 248(%rsp) +#define kk 256(%rsp) +#define kkk 264(%rsp) + +#endif + +#define PREFETCH0 prefetcht0 +#define PREFETCH1 prefetcht0 +#define PREFETCH2 prefetcht0 +#define PRESIZE 64 + +#define xvec0 %xmm0 +#define xvec1 %xmm1 +#define xvec2 %xmm2 +#define xvec3 %xmm3 +#define xvec4 %xmm4 +#define xvec5 %xmm5 +#define xvec6 %xmm6 +#define xvec7 %xmm7 +#define xvec8 %xmm8 +#define xvec9 %xmm9 +#define xvec10 %xmm10 +#define xvec11 %xmm11 +#define xvec12 %xmm12 +#define xvec13 %xmm13 +#define xvec14 %xmm14 +#define xvec15 %xmm15 + +#define yvec0 %ymm0 +#define yvec1 %ymm1 +#define yvec2 %ymm2 +#define yvec3 %ymm3 +#define yvec4 %ymm4 +#define yvec5 %ymm5 +#define yvec6 %ymm6 +#define yvec7 %ymm7 +#define yvec8 %ymm8 +#define yvec9 %ymm9 +#define yvec10 %ymm10 +#define yvec11 %ymm11 +#define yvec12 %ymm12 +#define yvec13 %ymm13 +#define yvec14 %ymm14 +#define yvec15 %ymm15 + +#define LEAQ leaq +#define ADDQ addq +#define MULQ imulq +#define SARQ sarq +#define SALQ salq +#define ANDQ andq +#define SUBQ subq +#define DECQ decq +#define JG jg +#define JLE jle +#define TEST testq +#define OR orq +#define JNE jne +#define JMP jmp +#define NOP +#define XOR 
xorpd +#undef MOVQ +#define MOVQ movq + +#define XOR_DY vxorpd +#define XOR_DX vxorpd + +#define LD_DY vmovapd +#define LD_DX vmovapd +#define LDL_DY vmovlpd +#define LDL_DX vmovlpd +#define LDH_DY vmovhpd +#define LDH_DX vmovhpd + +#define ST_DY vmovapd +#define ST_DX vmovapd +#define STL_DY vmovlpd +#define STL_DX vmovlpd +#define STH_DY vmovhpd +#define STH_DX vmovhpd + +#define EDUP_DY vmovddup + +#define ADD_DY vaddpd +#define ADD_DX vaddpd +#define SUB_DY vsubpd +#define SUB_DX vsubpd + +#define ADDSUB_DY vaddsubpd +#define ADDSUB_DX vaddsubpd + +#define MUL_DY vmulpd +#define MUL_DX vmulpd + +#define SHUF_DY vperm2f128 +#define SHUF_DX vpshufd + +#define VPERMILP_DY vpermilpd + +#define BROAD_DY vbroadcastsd +#define BROAD_DX vmovddup + +#define MOV_DY vmovapd +#define MOV_DX vmovapd + +#define REVS_DY vshufpd +#define REVS_DX vmovsd + +#define EXTRA_DY vextractf128 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1_DX ADD_DX +#define ADD1_DY ADD_DY +#define ADD2_DY ADDSUB_DY +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1_DX SUB_DX +#define ADD1_DY SUB_DY +#define ADD2_DY ADDSUB_DY +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1_DX SUB_DX +#define ADD1_DY SUB_DY +#define ADD2_DY ADDSUB_DY +#else +#define ADD1_DX ADD_DX +#define ADD1_DY ADD_DY +#define ADD2_DY ADDSUB_DY +#endif + +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, old_bm + movq ARG2, old_bn + movq ARG3, old_bk + movq OLD_A, ba + movq OLD_B, bb + movq OLD_C, C + movq old_ldc, ldc +#ifdef TRMMKERNEL + movq old_offset, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + +movq old_ldc, ldc +#ifdef TRMMKERNEL +movq old_offset, %r11; +#endif +#endif + +vzeroupper + +vmovlps %xmm0, MEMALPHA_R +vmovlps %xmm1, MEMALPHA_I +movq old_bm, bm +movq old_bn, bn +movq old_bk, bk +salq $ZBASE_SHIFT, ldc +#ifdef TRMMKERNEL +movq %r11, OFFSET +#ifndef LEFT +negq %r11; +#endif +movq %r11, kk; +#endif + +MOVQ bn,j; +SARQ $2,j; # Rn = 4 +JLE .L0_loopE; +ALIGN_5; +.L0_bodyB:; +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C,C0; +LEAQ (C,ldc,2),C1; +MOVQ bk, k; +SALQ $6, k; +LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2 +MOVQ ba,ptrba; +MOVQ bm,i; +SARQ $2,i; # Rm = 4 +JLE .L1_loopE; +ALIGN_5; +.L1_bodyB:; +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif + +PREFETCH0 0*SIZE(prebb); +PREFETCH0 8*SIZE(prebb); +PREFETCH0 16*SIZE(prebb) +ADDQ $24*SIZE, prebb; +# Initial Results Register +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +EDUP_DY 0*SIZE(ptrbb), yvec2; # Br1, Br1, Br2, Br2 +XOR_DY yvec13, yvec13, yvec13; +XOR_DY yvec12, yvec12, yvec12; +EDUP_DY 4*SIZE(ptrbb), yvec3; # Br3, Br3, Br4, Br4 +PREFETCH2 3*SIZE(C0); +PREFETCH2 3*SIZE(C1); +XOR_DY yvec11, yvec11, yvec11; +XOR_DY yvec10, 
yvec10, yvec10; +LD_DY 0*SIZE(ptrba), yvec0; # Ar1, Ai1, Ar2, Ai2 +PREFETCH2 7*SIZE(C0, ldc, 1); +PREFETCH2 7*SIZE(C1, ldc, 1); +XOR_DY yvec9, yvec9, yvec9; +XOR_DY yvec8, yvec8, yvec8; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2,k; # Unroll 4 times +JLE .L2_loopE; +ALIGN_5; +.L2_bodyB:; +#### Computing kernel #### + +#### Unroll time 1 #### +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1 +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 PRESIZE*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 8*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 12*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 8*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; + +#### Unroll time 2 #### +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 (PRESIZE+8)*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 16*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 20*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 
16*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; + +#### Unroll time 3 #### +LD_DY 20*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 (PRESIZE+16)*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 17*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 21*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 24*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 28*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 24*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; + +#### Unroll time 4 #### +LD_DY 28*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADDQ $32*SIZE, ptrba; +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 (PRESIZE+24)*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 25*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 29*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADDQ $32*SIZE, ptrbb; +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 4*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 0*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; +DECQ k; +JG .L2_bodyB; +ALIGN_5 +.L2_loopE:; +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif 
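[annotation] Throughout this zgemm kernel, each complex multiply-accumulate is evaluated in two passes: ADD1_DY folds in a*(real part of b), then the operand is lane-swapped with VPERMILP_DY and ADD2_DY (vaddsubpd) folds in a*(imaginary part of b), subtracting in the real lane and adding in the imaginary lane. A scalar C sketch of one such update for the NN-family sign choice, operands as (re,im) pairs:

    static void cmadd(double c[2], const double a[2], const double b[2])
    {
        /* pass 1 (ADD1 = vaddpd): acc += (ar,ai) * br */
        c[0] += a[0] * b[0];
        c[1] += a[1] * b[0];
        /* pass 2 (ADD2 = vaddsubpd on swapped a = (ai,ar)):
           real lane subtracts ai*bi, imaginary lane adds ar*bi */
        c[0] -= a[1] * b[1];
        c[1] += a[0] * b[1];
    }

The conjugated variants (NR/RN/RR families) flip the sign of pass 1 or pre/post-swap the accumulators, which is exactly what the ADD1_DY/ADD2_DY macro selection and the VPERMILP_DY/ADDSUB_DY fix-up after .L4_loopE implement.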
+JLE .L3_loopE; +ALIGN_5 +.L3_bodyB: +#### Unroll time 1 #### +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1 +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 PRESIZE*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 8*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 12*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 8*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; + +#### Unroll time 2 #### +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADDQ $16*SIZE, ptrba +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +PREFETCH0 (PRESIZE+8)*SIZE(ptrba); +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADDQ $16*SIZE, ptrbb +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 4*SIZE(ptrbb), yvec3; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +LD_DY 0*SIZE(ptrba), yvec0; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; +.L3_loopE:; +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L4_loopE; +ALIGN_5 +.L4_loopB:; +#### Unroll time 1 #### +PREFETCH0 PRESIZE*SIZE(ptrba); +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec0, yvec2, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY 
yvec0, yvec3, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 +ADDQ $8*SIZE, ptrba; +ADD1_DY yvec6, yvec15, yvec15; +ADD1_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 +MUL_DY yvec1, yvec3, yvec7; +EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 +ADD1_DY yvec6, yvec14, yvec14; +ADD1_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 +ADDQ $8*SIZE, ptrbb; +ADD1_DY yvec6, yvec13, yvec13; +ADD1_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 +MUL_DY yvec1, yvec5, yvec7; +SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 +ADD1_DY yvec6, yvec12, yvec12; +ADD1_DY yvec7, yvec8, yvec8; + +VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 +MUL_DY yvec0, yvec2, yvec6; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec6, yvec15, yvec15; +ADD2_DY yvec7, yvec11, yvec11; + +MUL_DY yvec1, yvec2, yvec6; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec6, yvec14, yvec14; +ADD2_DY yvec7, yvec10, yvec10; + +MUL_DY yvec0, yvec4, yvec6; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec6, yvec13, yvec13; +ADD2_DY yvec7, yvec9, yvec9; + +MUL_DY yvec1, yvec4, yvec6; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec6, yvec12, yvec12; +ADD2_DY yvec7, yvec8, yvec8; +.L4_loopE:; +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +ADDSUB_DY yvec11, yvec7, yvec11; +ADDSUB_DY yvec10, yvec7, yvec10; +ADDSUB_DY yvec9, yvec7, yvec9; +ADDSUB_DY yvec8, yvec7, yvec8; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec14, yvec7, yvec14; +SUB_DY yvec13, yvec7, yvec13; +SUB_DY yvec12, yvec7, yvec12; +SUB_DY yvec11, yvec7, yvec11; +SUB_DY yvec10, yvec7, yvec10; +SUB_DY yvec9, yvec7, yvec9; +SUB_DY yvec8, yvec7, yvec8; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +VPERMILP_DY $0x05, yvec11, yvec11; +VPERMILP_DY $0x05, yvec10, yvec10; +VPERMILP_DY $0x05, yvec9, yvec9; +VPERMILP_DY $0x05, yvec8, yvec8; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +ADDSUB_DY yvec11, yvec7, yvec11; +ADDSUB_DY yvec10, yvec7, yvec10; +ADDSUB_DY yvec9, yvec7, yvec9; +ADDSUB_DY yvec8, yvec7, yvec8; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +VPERMILP_DY $0x05, yvec11, yvec11; +VPERMILP_DY $0x05, yvec10, yvec10; +VPERMILP_DY $0x05, yvec9, yvec9; +VPERMILP_DY $0x05, yvec8, yvec8; +#endif +#### Load Alpha #### +BROAD_DY MEMALPHA_R,yvec7; +BROAD_DY MEMALPHA_I,yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADDSUB_DY yvec5, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec4; +MUL_DY yvec7, yvec14, yvec14; +MUL_DY yvec6, yvec4, yvec4; +ADDSUB_DY yvec4, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec3; +MUL_DY yvec7, yvec13, yvec13; +MUL_DY yvec6, yvec3, yvec3; +ADDSUB_DY yvec3, yvec13, yvec13; +VPERMILP_DY $0x05,yvec12, yvec2; +MUL_DY yvec7, yvec12, yvec12; +MUL_DY yvec6, 
yvec2, yvec2; +ADDSUB_DY yvec2, yvec12, yvec12; +VPERMILP_DY $0x05, yvec11, yvec1; +MUL_DY yvec7, yvec11, yvec11; +MUL_DY yvec6, yvec1, yvec1; +ADDSUB_DY yvec1, yvec11, yvec11; +VPERMILP_DY $0x05,yvec10, yvec0; +MUL_DY yvec7, yvec10, yvec10; +MUL_DY yvec6, yvec0, yvec0; +ADDSUB_DY yvec0, yvec10, yvec10; +VPERMILP_DY $0x05, yvec9, yvec5; +MUL_DY yvec7, yvec9, yvec9; +MUL_DY yvec6, yvec5, yvec5; +ADDSUB_DY yvec5, yvec9, yvec9; +VPERMILP_DY $0x05, yvec8, yvec4; +MUL_DY yvec7, yvec8, yvec8; +MUL_DY yvec6, yvec4, yvec4; +ADDSUB_DY yvec4, yvec8, yvec8; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L4_loopEx; +ALIGN_5 +#### Store Back #### +EXTRA_DY $1,yvec15,xvec7; +EXTRA_DY $1,yvec14,xvec6; +EXTRA_DY $1,yvec13,xvec5; +EXTRA_DY $1,yvec12,xvec4; +EXTRA_DY $1,yvec11,xvec3; +EXTRA_DY $1,yvec10,xvec2; +EXTRA_DY $1,yvec9,xvec1; +EXTRA_DY $1,yvec8,xvec0; +#ifndef TRMMKERNEL +ADD_DY 0*SIZE(C0),xvec15, xvec15; +ADD_DY 2*SIZE(C0,ldc,1), xvec7, xvec7; +ADD_DY 4*SIZE(C0),xvec14, xvec14; +ADD_DY 6*SIZE(C0,ldc,1),xvec6, xvec6; +ADD_DY 0*SIZE(C0,ldc,1),xvec13, xvec13; +ADD_DY 2*SIZE(C0),xvec5, xvec5; +ADD_DY 4*SIZE(C0,ldc,1),xvec12, xvec12; +ADD_DY 6*SIZE(C0),xvec4, xvec4; +ADD_DY 0*SIZE(C1),xvec11, xvec11; +ADD_DY 2*SIZE(C1,ldc,1),xvec3, xvec3; +ADD_DY 4*SIZE(C1),xvec10, xvec10; +ADD_DY 6*SIZE(C1,ldc,1),xvec2, xvec2; +ADD_DY 0*SIZE(C1,ldc,1),xvec9, xvec9; +ADD_DY 2*SIZE(C1),xvec1, xvec1; +ADD_DY 4*SIZE(C1,ldc,1),xvec8, xvec8; +ADD_DY 6*SIZE(C1),xvec0, xvec0; +#endif +ST_DY xvec15,0*SIZE(C0); +ST_DY xvec7,2*SIZE(C0,ldc,1); +ST_DY xvec14,4*SIZE(C0); +ST_DY xvec6,6*SIZE(C0,ldc,1); +ST_DY xvec13,0*SIZE(C0,ldc,1); +ST_DY xvec5,2*SIZE(C0); +ST_DY xvec12,4*SIZE(C0,ldc,1); +ST_DY xvec4,6*SIZE(C0); +ST_DY xvec11,0*SIZE(C1); +ST_DY xvec3,2*SIZE(C1,ldc,1); +ST_DY xvec10,4*SIZE(C1); +ST_DY xvec2,6*SIZE(C1,ldc,1); +ST_DY xvec9,0*SIZE(C1,ldc,1); +ST_DY xvec1,2*SIZE(C1); +ST_DY xvec8,4*SIZE(C1,ldc,1); +ST_DY xvec0,6*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE,C0; +ADDQ $8*SIZE,C1; +.L1_bodyE:; +DECQ i; +JG .L1_bodyB; +JMP .L1_loopE; +ALIGN_5 +.L4_loopEx: +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C0), xvec0, xvec0; +LDH_DY 1*SIZE(C0), xvec0, xvec0; +LDL_DY 2*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DY 3*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DY 4*SIZE(C0), xvec2, xvec2; +LDH_DY 5*SIZE(C0), xvec2, xvec2; +LDL_DY 6*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_DY 7*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_DY xvec0, xvec15, xvec15; +ADD_DY xvec1, xvec7, xvec7; +ADD_DY xvec2, xvec14, xvec14; +ADD_DY xvec3, xvec6, xvec6; +#endif +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec6, 6*SIZE(C0, ldc, 1); +STH_DY xvec6, 7*SIZE(C0, ldc, 1); +EXTRA_DY $1, yvec13, xvec5; +EXTRA_DY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_DY 1*SIZE(C0, ldc, 1), xvec3, xvec3; +LDL_DY 2*SIZE(C0), xvec2, xvec2; +LDH_DY 3*SIZE(C0), xvec2, xvec2; +LDL_DY 4*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DY 5*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DY 6*SIZE(C0), xvec0, xvec0; +LDH_DY 7*SIZE(C0), xvec0, xvec0; +ADD_DY xvec3, 
xvec13, xvec13; +ADD_DY xvec2, xvec5, xvec5; +ADD_DY xvec1, xvec12, xvec12; +ADD_DY xvec0, xvec4, xvec4; +#endif +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec5, 2*SIZE(C0); +STH_DY xvec5, 3*SIZE(C0); +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc, 1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); +EXTRA_DY $1, yvec11, xvec3; +EXTRA_DY $1, yvec10, xvec2; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C1), xvec7, xvec7; +LDH_DY 1*SIZE(C1), xvec7, xvec7; +LDL_DY 2*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DY 3*SIZE(C1, ldc, 1), xvec6, xvec6; +LDL_DY 4*SIZE(C1), xvec5, xvec5; +LDH_DY 5*SIZE(C1), xvec5, xvec5; +LDL_DY 6*SIZE(C1, ldc, 1), xvec4, xvec4; +LDH_DY 7*SIZE(C1, ldc, 1), xvec4, xvec4; +ADD_DY xvec7, xvec11, xvec11; +ADD_DY xvec6, xvec3, xvec3; +ADD_DY xvec5, xvec10, xvec10; +ADD_DY xvec4, xvec2, xvec2; +#endif +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec3, 2*SIZE(C1, ldc, 1); +STH_DY xvec3, 3*SIZE(C1, ldc, 1); +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec2, 6*SIZE(C1, ldc, 1); +STH_DY xvec2, 7*SIZE(C1, ldc, 1); +EXTRA_DY $1, yvec9, xvec1; +EXTRA_DY $1, yvec8, xvec0; +#ifndef TRMMKERNEL +LDL_DY 0*SIZE(C1, ldc, 1), xvec5, xvec5; +LDH_DY 1*SIZE(C1, ldc, 1), xvec5, xvec5; +LDL_DY 2*SIZE(C1), xvec4, xvec4; +LDH_DY 3*SIZE(C1), xvec4, xvec4; +LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3; +LDL_DY 6*SIZE(C1), xvec2, xvec2; +LDH_DY 7*SIZE(C1), xvec2, xvec2; +ADD_DY xvec5, xvec9, xvec9; +ADD_DY xvec4, xvec1, xvec1; +ADD_DY xvec3, xvec8, xvec8; +ADD_DY xvec2, xvec0, xvec0; +#endif +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec1, 2*SIZE(C1); +STH_DY xvec1, 3*SIZE(C1); +STL_DY xvec8, 4*SIZE(C1, ldc, 1); +STH_DY xvec8, 5*SIZE(C1, ldc, 1); +STL_DY xvec0, 6*SIZE(C1); +STH_DY xvec0, 7*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L1_bodyB; +ALIGN_5; +.L1_loopE:; +TEST $2, bm; +JLE .L5_loopE; +ALIGN_5 +.L5_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +XOR_DY yvec13, yvec13, yvec13; +XOR_DY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L7_loopE; +ALIGN_5 +.L7_bodyB: +#### Compute kernel #### +#### Unroll times 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 1*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, 
yvec12; +EDUP_DY 5*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 2 #### +LD_DY 4*SIZE(ptrba), yvec0; +EDUP_DY 8*SIZE(ptrbb), yvec2; +EDUP_DY 12*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 9*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 13*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 3 #### +LD_DY 8*SIZE(ptrba), yvec0; +EDUP_DY 16*SIZE(ptrbb), yvec2; +EDUP_DY 20*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 17*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 21*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 4 #### +LD_DY 12*SIZE(ptrba), yvec0; +EDUP_DY 24*SIZE(ptrbb), yvec2; +EDUP_DY 28*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 25*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 29*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +ADDQ $16*SIZE, ptrba; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L7_bodyB; +ALIGN_5 +.L7_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L8_loopE; +ALIGN_5 +.L8_bodyB: +#### Unroll times 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 
1*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 5*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 2 #### +LD_DY 4*SIZE(ptrba), yvec0; +EDUP_DY 8*SIZE(ptrbb), yvec2; +EDUP_DY 12*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 9*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 13*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +ADDQ $8*SIZE, ptrba; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; +ADDQ $16*SIZE, ptrbb; +.L8_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L9_loopE; +ALIGN_5 +.L9_bodyB: +#### Unroll times 1 #### +LD_DY 0*SIZE(ptrba), yvec0; +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +EDUP_DY 1*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec5, yvec7; +ADD1_DY yvec7 ,yvec12, yvec12; +EDUP_DY 5*SIZE(ptrbb), yvec3 + +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec4, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec0, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L9_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec14, yvec7, yvec14; +SUB_DY yvec13, yvec7, yvec13; +SUB_DY yvec12, yvec7, yvec12; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +#endif +#### Load Alpha #### +BROAD_DY MEMALPHA_R, yvec7; +BROAD_DY MEMALPHA_I, yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, 
yvec15; +VPERMILP_DY $0x05, yvec14, yvec4; +MUL_DY yvec7, yvec14, yvec14; +MUL_DY yvec6, yvec4, yvec4; +ADD2_DY yvec4, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec3; +MUL_DY yvec7, yvec13, yvec13; +MUL_DY yvec6, yvec3, yvec3; +ADD2_DY yvec3, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec2; +MUL_DY yvec7, yvec12, yvec12; +MUL_DY yvec6, yvec2, yvec2; +ADD2_DY yvec2, yvec12, yvec12; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L9_loopEx; +ALIGN_5 +#### Writing back #### +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +EXTRA_DY $1, yvec13, xvec5; +EXTRA_DY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec14, xvec14; +ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; +ADD_DX 2*SIZE(C1), xvec4, xvec4; +#endif +ST_DX xvec15, 0*SIZE(C0); +ST_DX xvec7, 2*SIZE(C0, ldc, 1); +ST_DX xvec13, 0*SIZE(C0, ldc, 1); +ST_DX xvec5, 2*SIZE(C0); +ST_DX xvec14, 0*SIZE(C1); +ST_DX xvec6, 2*SIZE(C1, ldc, 1); +ST_DX xvec12, 0*SIZE(C1, ldc, 1); +ST_DX xvec4, 2*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +JMP .L5_loopE; +ALIGN_5 +.L9_loopEx: +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +EXTRA_DY $1, yvec13, xvec5; +EXTRA_DY $1, yvec12, xvec4; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 2*SIZE(C0, ldc, 1); +STH_DX xvec7, 3*SIZE(C0, ldc, 1); +STL_DX xvec13, 0*SIZE(C0, ldc, 1); +STH_DX xvec13, 1*SIZE(C0, ldc, 1); +STL_DX xvec5, 2*SIZE(C0); +STH_DX xvec5, 3*SIZE(C0); +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C1), xvec0, xvec0; +LDH_DX 1*SIZE(C1), xvec0, xvec0; +LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec3, xvec3; +LDH_DX 3*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec14, xvec14; +ADD_DX xvec1, xvec6, xvec6; +ADD_DX xvec2, xvec12, xvec12; +ADD_DX xvec3, xvec4, xvec4; +#endif +STL_DX xvec14, 0*SIZE(C1); +STH_DX xvec14, 1*SIZE(C1); +STL_DX xvec6, 2*SIZE(C1, ldc, 1); +STH_DX xvec6, 3*SIZE(C1, ldc, 1); +STL_DX xvec12, 0*SIZE(C1, ldc, 1); +STH_DX xvec12, 1*SIZE(C1, ldc, 1); +STL_DX xvec4, 2*SIZE(C1); +STH_DX xvec4, 3*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; +.L5_loopE: +TEST $1, bm; +JLE .L6_loopE; +ALIGN_5 +.L6_bodyB: +#if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $4, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L10_loopE; +ALIGN_5 +.L10_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 8*SIZE(ptrbb), yvec2; +EDUP_DY 12*SIZE(ptrbb), yvec3; + +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 9*SIZE(ptrbb), yvec2; +EDUP_DY 13*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +LD_DY 4*SIZE(ptrba), yvec0; +EDUP_DY 16*SIZE(ptrbb), yvec2; +EDUP_DY 20*SIZE(ptrbb), yvec3; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 17*SIZE(ptrbb), yvec2; +EDUP_DY 21*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 24*SIZE(ptrbb), yvec2; +EDUP_DY 28*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 25*SIZE(ptrbb), yvec2; +EDUP_DY 29*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14 +ADDQ $8*SIZE, ptrba; +ADDQ $32*SIZE, ptrbb; +DECQ k; +JG .L10_bodyB; +ALIGN_5 +.L10_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L11_loopE; +ALIGN_5 +.L11_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 8*SIZE(ptrbb), yvec2; +EDUP_DY 12*SIZE(ptrbb), yvec3; + +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 9*SIZE(ptrbb), yvec2; +EDUP_DY 13*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, 
yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $4*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; + +.L11_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L12_loopE; +ALIGN_5 +.L12_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; +EDUP_DY 4*SIZE(ptrbb), yvec3; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +MUL_DY yvec1, yvec3, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +MUL_DY yvec4, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $2*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L12_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec14, yvec7, yvec14; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +#endif +#### Multiply Alpha #### +BROAD_DY MEMALPHA_R, yvec7; +BROAD_DY MEMALPHA_I, yvec6; +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec4; +MUL_DY yvec7, yvec14, yvec14; +MUL_DY yvec6, yvec4, yvec4; +ADD2_DY yvec4, yvec14, yvec14; +#### Writing Back #### +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 0*SIZE(C0, ldc, 1); +STH_DX xvec7, 1*SIZE(C0, ldc, 1); +STL_DX xvec14, 0*SIZE(C1); +STH_DX xvec14, 1*SIZE(C1); +STL_DX xvec6, 0*SIZE(C1, ldc, 1); +STH_DX xvec6, 1*SIZE(C1, ldc, 1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 4), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L6_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $4, kk; +#endif +MOVQ bk,k; +SALQ $6,k; +ADDQ k,bb; +LEAQ (C,ldc,4),C; +.L0_bodyE:; +DECQ j; +JG .L0_bodyB; +ALIGN_5; +.L0_loopE:; +TEST $2, bn; +JLE .L20_loopE; +ALIGN_5 +.L20_bodyB: +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ C, C0; +LEAQ (C, ldc, 1), C1; +MOVQ ba, ptrba; +MOVQ bm, i; +SARQ $2, i; +JLE .L21_loopE; +ALIGN_5 +.L21_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, 
%rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +XOR_DY yvec13, yvec13, yvec13; +XOR_DY yvec12, yvec12, yvec12; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L211_loopE; +ALIGN_5 +.L211_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 2 #### +EDUP_DY 4*SIZE(ptrbb), yvec2; +LD_DY 8*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 3 #### +EDUP_DY 8*SIZE(ptrbb), yvec2; +LD_DY 16*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 20*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 9*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 4 #### +EDUP_DY 12*SIZE(ptrbb), yvec2; +LD_DY 24*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 28*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 13*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $16*SIZE, ptrbb; + +MUL_DY 
yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; +ADDQ $32*SIZE, ptrba; +DECQ k; +JG .L211_bodyB; +ALIGN_5 +.L211_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L212_loopE; +ALIGN_5 +.L212_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +#### Unroll time 2 #### +EDUP_DY 4*SIZE(ptrbb), yvec2; +LD_DY 8*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; + +ADDQ $8*SIZE, ptrbb; +ADDQ $16*SIZE, ptrba; + +.L212_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L213_loopE; +ALIGN_5 +.L213_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; +MUL_DY yvec1, yvec4, yvec7; +ADD1_DY yvec7, yvec12, yvec12; +VPERMILP_DY $0x05, yvec1, yvec1; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +MUL_DY yvec1, yvec5, yvec7; +ADD2_DY yvec7, yvec12, yvec12; +ADDQ $4*SIZE, ptrbb; +ADDQ $8*SIZE, ptrba; + +.L213_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec14, yvec7, yvec14; +SUB_DY yvec13, yvec7, yvec13; +SUB_DY yvec12, yvec7, yvec12; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY 
yvec14, yvec7, yvec14; +ADDSUB_DY yvec13, yvec7, yvec13; +ADDSUB_DY yvec12, yvec7, yvec12; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec13; +VPERMILP_DY $0x05, yvec12, yvec12; +#endif +#### Load Alpha #### +BROAD_DY MEMALPHA_R,yvec7; +BROAD_DY MEMALPHA_I,yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec4; +MUL_DY yvec7, yvec14, yvec14; +MUL_DY yvec6, yvec4, yvec4; +ADD2_DY yvec4, yvec14, yvec14; +VPERMILP_DY $0x05, yvec13, yvec3; +MUL_DY yvec7, yvec13, yvec13; +MUL_DY yvec6, yvec3, yvec3; +ADD2_DY yvec3, yvec13, yvec13; +VPERMILP_DY $0x05,yvec12, yvec2; +MUL_DY yvec7, yvec12, yvec12; +MUL_DY yvec6, yvec2, yvec2; +ADD2_DY yvec2, yvec12, yvec12; +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +EXTRA_DY $1, yvec13, xvec5; +EXTRA_DY $1, yvec12, xvec4; +#### Testing Alignment #### +MOVQ C0, %rax; +OR ldc, %rax; +TEST $15, %rax; +JNE .L213_loopEx; +ALIGN_5 +#### Writing back #### +#ifndef TRMMKERNEL +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C1), xvec6, xvec6; +ADD_DX 0*SIZE(C1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 4*SIZE(C1), xvec12, xvec12; +ADD_DX 6*SIZE(C0), xvec4, xvec4; +#endif +ST_DX xvec15,0*SIZE(C0); +ST_DX xvec7,2*SIZE(C1); +ST_DX xvec14,4*SIZE(C0); +ST_DX xvec6,6*SIZE(C1); +ST_DX xvec13,0*SIZE(C1); +ST_DX xvec5,2*SIZE(C0); +ST_DX xvec12,4*SIZE(C1); +ST_DX xvec4,6*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L21_bodyB; +JMP .L21_loopE; +ALIGN_5 +.L213_loopEx: +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C1), xvec3, xvec3; +LDH_DX 7*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 2*SIZE(C1); +STH_DX xvec7, 3*SIZE(C1); +STL_DX xvec14, 4*SIZE(C0); +STH_DX xvec14, 5*SIZE(C0); +STL_DX xvec6, 6*SIZE(C1); +STH_DX xvec6, 7*SIZE(C1); +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C1), xvec3, xvec3; +LDH_DX 1*SIZE(C1), xvec3, xvec3; +LDL_DX 2*SIZE(C0), xvec2, xvec2; +LDH_DX 3*SIZE(C0), xvec2, xvec2; +LDL_DX 4*SIZE(C1), xvec1, xvec1; +LDH_DX 5*SIZE(C1), xvec1, xvec1; +LDL_DX 6*SIZE(C0), xvec0, xvec0; +LDH_DX 7*SIZE(C0), xvec0, xvec0; +ADD_DX xvec3, xvec13, xvec13; +ADD_DX xvec2, xvec5, xvec5; +ADD_DX xvec1, xvec12, xvec12; +ADD_DX xvec0, xvec4, xvec4; +#endif +STL_DX xvec13, 0*SIZE(C1); +STH_DX xvec13, 1*SIZE(C1); +STL_DX xvec5, 2*SIZE(C0); +STH_DX xvec5, 3*SIZE(C0); +STL_DX xvec12, 4*SIZE(C1); +STH_DX xvec12, 5*SIZE(C1); +STL_DX xvec4, 6*SIZE(C0); +STH_DX xvec4, 7*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +LEAQ (ptrbb, %rax, 2), 
ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE, C0; +ADDQ $8*SIZE, C1; +DECQ i; +JG .L21_bodyB; +ALIGN_5 +.L21_loopE: +TEST $2, bm; +JLE .L22_loopE; +ALIGN_5 +.L22_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec13; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L221_loopE; +ALIGN_5 +.L221_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; + +#### Unroll time 2 #### +EDUP_DY 4*SIZE(ptrbb), yvec2; +LD_DY 4*SIZE(ptrba), yvec0; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; + +#### Unroll time 3 #### +EDUP_DY 8*SIZE(ptrbb), yvec2; +LD_DY 8*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; + +EDUP_DY 9*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; + +#### Unroll time 4 #### +EDUP_DY 12*SIZE(ptrbb), yvec2; +LD_DY 12*SIZE(ptrba), yvec0; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec2, yvec2, yvec4; + +EDUP_DY 13*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; +ADDQ $16*SIZE, ptrbb; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +ADDQ $16*SIZE, ptrba; +DECQ k; +JG .L221_bodyB; +ALIGN_5 +.L221_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L222_loopE; +ALIGN_5 +.L222_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; + +#### Unroll time 2 #### +EDUP_DY 4*SIZE(ptrbb), yvec2; +LD_DY 4*SIZE(ptrba), 
yvec0; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +EDUP_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L222_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L223_loopE; +ALIGN_5 +.L223_bodyB: +#### Unroll time 1 #### +EDUP_DY 0*SIZE(ptrbb), yvec2; +LD_DY 0*SIZE(ptrba), yvec0; +SHUF_DY $0x03, yvec2, yvec2, yvec4; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +EDUP_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec0, yvec4, yvec6; +ADD1_DY yvec6, yvec13, yvec13; +VPERMILP_DY $0x05, yvec0, yvec0; + +MUL_DY yvec0, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +SHUF_DY $0x03, yvec3, yvec3, yvec5; + +MUL_DY yvec0, yvec5, yvec6; +ADD2_DY yvec6, yvec13, yvec13; +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L223_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec13, yvec7, yvec13; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec13, yvec7, yvec13; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec13, yvec13; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec13, yvec7, yvec13; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec13, yvec13; +#endif + +#### Load Alpha #### +BROAD_DY MEMALPHA_R,yvec7; +BROAD_DY MEMALPHA_I,yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +VPERMILP_DY $0x05, yvec13, yvec3; +MUL_DY yvec7, yvec13, yvec13; +MUL_DY yvec6, yvec3, yvec3; +ADD2_DY yvec3, yvec13, yvec13; +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec13, xvec5; +#### Write back #### +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 2*SIZE(C1); +STH_DX xvec7, 3*SIZE(C1); +STL_DX xvec13, 0*SIZE(C1); +STH_DX xvec13, 1*SIZE(C1); +STL_DX xvec5, 2*SIZE(C0); +STH_DX xvec5, 3*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $4*SIZE, C0; +ADDQ $4*SIZE, C1; + +.L22_loopE: +TEST $1, bm; +JLE .L23_loopE; +ALIGN_5 +.L23_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif 
(defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $2, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L231_loopE; +ALIGN_5 +.L231_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 4*SIZE(ptrbb), yvec2; + +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 5*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec0; +EDUP_DY 8*SIZE(ptrbb), yvec2; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 9*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 12*SIZE(ptrbb), yvec2; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 13*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $16*SIZE, ptrbb; +DECQ k; +JG .L231_bodyB; +ALIGN_5 +.L231_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L232_loopE; +ALIGN_5 +.L232_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +SHUF_DY $0x31, yvec0, yvec0, yvec1; +EDUP_DY 4*SIZE(ptrbb), yvec2; + +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 5*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; + +.L232_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L233_loopE; +ALIGN_5 +.L233_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i +EDUP_DY 0*SIZE(ptrbb), yvec2; + +SHUF_DY $0x20, yvec0, yvec0, yvec1; +MUL_DY yvec1, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec4; +EDUP_DY 1*SIZE(ptrbb), yvec2; +MUL_DY yvec4, yvec2, yvec6; +ADD2_DY yvec6, yvec15, yvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L233_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +ADDSUB_DY yvec15, yvec7, yvec15; +VPERMILP_DY $0x05, yvec15, yvec15; +#endif + +#### Multiply Alpha #### +BROAD_DY MEMALPHA_R, yvec7; +BROAD_DY MEMALPHA_I, yvec6; +#### Writing Back #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +EXTRA_DY $1, yvec15, xvec7; +#### Writing Back #### +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 1*SIZE(C1), 
xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 0*SIZE(C1); +STH_DX xvec7, 1*SIZE(C1); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +LEAQ (ptrbb, %rax, 2), ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $2*SIZE, C0; +ADDQ $2*SIZE, C1; +.L23_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $2, kk; +#endif +MOVQ bk, k; +SALQ $5, k; +ADDQ k, bb; +LEAQ (C, ldc, 2), C; +.L20_loopE: +TEST $1, bn; +JLE .L30_loopE; +ALIGN_5 +.L30_bodyB: +#if defined(TRMMKERNEL) && defined(LEFT) +MOVQ OFFSET, %rax; +MOVQ %rax, kk; +#endif +MOVQ ba, ptrba; +MOVQ C, C0; +MOVQ bm, i; +SARQ $2, i; +JLE .L31_loopE; +ALIGN_5 +.L31_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +XOR_DY yvec14, yvec14, yvec14; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $4, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L311_loopE; +ALIGN_5 +.L311_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +LD_DY 8*SIZE(ptrba), yvec0; +BROAD_DY 2*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 3*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +LD_DY 16*SIZE(ptrba), yvec0; +BROAD_DY 4*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 20*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +LD_DY 24*SIZE(ptrba), yvec0; +BROAD_DY 6*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 28*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 7*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $32*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L311_bodyB; +ALIGN_5 +.L311_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L312_loopE; +ALIGN_5 +.L312_bodyB: 
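+#### Two-pass complex multiply (macro semantics inferred from use here; BROAD_DY, ADD1_DY and ADD2_DY are defined earlier in this file): +# pass 1 multiplies the packed A vector (ar, ai, ...) by the broadcast real part of B and accumulates with ADD1_DY; +# pass 2 swaps re/im within each 128-bit lane (VPERMILP_DY $0x05), multiplies by the broadcast imaginary part of B, +# and accumulates with ADD2_DY, whose sign pattern yields ar*br-ai*bi in the real slot and ar*bi+ai*br in the imaginary slot. 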
+LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; + +LD_DY 8*SIZE(ptrba), yvec0; +BROAD_DY 2*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 12*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 3*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $16*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L312_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L313_loopE; +ALIGN_5 +.L313_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec1; +MUL_DY yvec1, yvec2, yvec7; +ADD1_DY yvec7, yvec14, yvec14; + +VPERMILP_DY $0x05, yvec0, yvec4; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec4, yvec3, yvec6; +ADD2_DY yvec6, yvec15, yvec15; + +VPERMILP_DY $0x05, yvec1, yvec5; +MUL_DY yvec5, yvec3, yvec7; +ADD2_DY yvec7, yvec14, yvec14; +ADDQ $8*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L313_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +SUB_DY yvec14, yvec7, yvec14; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +ADDSUB_DY yvec15, yvec7, yvec15; +ADDSUB_DY yvec14, yvec7, yvec14; +VPERMILP_DY $0x05, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec14; +#endif + +#### Load Alpha #### +BROAD_DY MEMALPHA_R,yvec7; +BROAD_DY MEMALPHA_I,yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +VPERMILP_DY $0x05, yvec14, yvec4; +MUL_DY yvec7, yvec14, yvec14; +MUL_DY yvec6, yvec4, yvec4; +ADD2_DY yvec4, yvec14, yvec14; +EXTRA_DY $1, yvec15, xvec7; +EXTRA_DY $1, yvec14, xvec6; +#### Writing Back #### +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 2*SIZE(C0); +STH_DX xvec7, 3*SIZE(C0); +STL_DX xvec14, 4*SIZE(C0); +STH_DX xvec14, 5*SIZE(C0); +STL_DX xvec6, 6*SIZE(C0); +STH_DX xvec6, 7*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 4), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $4, kk; +#endif +ADDQ $8*SIZE, C0; +DECQ i; +JG 
.L31_bodyB; +ALIGN_5 +.L31_loopE: +TEST $2, bm; +JLE .L32_loopE; +ALIGN_5 +.L32_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $2, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L321_loopE; +ALIGN_5 +.L321_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec0; +BROAD_DY 2*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 3*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; + +LD_DY 8*SIZE(ptrba), yvec0; +BROAD_DY 4*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 5*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; + +LD_DY 12*SIZE(ptrba), yvec0; +BROAD_DY 6*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 7*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; +ADDQ $16*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L321_bodyB; +ALIGN_5 +.L321_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L322_loopE; +ALIGN_5 +.L322_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; + +LD_DY 4*SIZE(ptrba), yvec0; +BROAD_DY 2*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 3*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L322_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L323_loopE; +ALIGN_5 +.L323_bodyB: +LD_DY 0*SIZE(ptrba), yvec0; +BROAD_DY 0*SIZE(ptrbb), yvec2; +MUL_DY yvec0, yvec2, yvec6; +ADD1_DY yvec6, yvec15, yvec15; +VPERMILP_DY $0x05, yvec0, yvec1; +BROAD_DY 1*SIZE(ptrbb), yvec3; +MUL_DY yvec1, yvec3, yvec7; +ADD2_DY yvec7, yvec15, yvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; +.L323_loopE: +#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DY yvec15, yvec7, yvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DY yvec15, yvec7, yvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +VPERMILP_DY $0x05, yvec15, yvec15; +ADDSUB_DY yvec15, yvec7, yvec15; +VPERMILP_DY $0x05, yvec15, yvec15; +#endif + +#### Load Alpha #### +BROAD_DY MEMALPHA_R,yvec7; +BROAD_DY MEMALPHA_I,yvec6; +#### Multiply Alpha #### +VPERMILP_DY $0x05, yvec15, yvec5; +MUL_DY yvec7, yvec15, yvec15; +MUL_DY yvec6, yvec5, yvec5; +ADD2_DY yvec5, yvec15, yvec15; +EXTRA_DY $1, 
yvec15, xvec7; +#### Writing Back #### +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +STL_DX xvec7, 2*SIZE(C0); +STH_DX xvec7, 3*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +LEAQ (ptrba, %rax, 2), ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $2, kk; +#endif +ADDQ $4*SIZE, C0; +.L32_loopE: +TEST $1, bm; +JLE .L33_loopE; +ALIGN_5 +.L33_bodyB: +#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) +MOVQ bb,ptrbb; +#else +MOVQ bb, ptrbb; +MOVQ kk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +XOR_DY yvec15, yvec15, yvec15; +#ifndef TRMMKERNEL +MOVQ bk,k; +#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kk, %rax; +MOVQ %rax, kkk; +#else +MOVQ kk, %rax; +#ifdef LEFT +ADDQ $1, %rax; +#else +ADDQ $1, %rax; +#endif +MOVQ %rax, kkk; +#endif +SARQ $2, k; +JLE .L331_loopE; +ALIGN_5 +.L331_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; +BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 1*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec0; +BROAD_DX 2*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 3*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; + +LD_DX 4*SIZE(ptrba), xvec0; +BROAD_DX 4*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 5*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; + +LD_DX 6*SIZE(ptrba), xvec0; +BROAD_DX 6*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 7*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; +ADDQ $8*SIZE, ptrba; +ADDQ $8*SIZE, ptrbb; +DECQ k; +JG .L331_bodyB; +ALIGN_5 +.L331_loopE: +#ifndef TRMMKERNEL +TEST $2, bk; +#else +TEST $2, kkk; +#endif +JLE .L332_loopE; +ALIGN_5 +.L332_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; +BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 1*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; + +LD_DX 2*SIZE(ptrba), xvec0; +BROAD_DX 2*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 3*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; +ADDQ $4*SIZE, ptrba; +ADDQ $4*SIZE, ptrbb; + +.L332_loopE: +#ifndef TRMMKERNEL +TEST $1, bk; +#else +TEST $1, kkk; +#endif +JLE .L333_loopE; +ALIGN_5 +.L333_bodyB: +LD_DX 0*SIZE(ptrba), xvec0; +BROAD_DX 0*SIZE(ptrbb), xvec2; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; + +SHUF_DX $0x4e, xvec0, xvec1; +BROAD_DX 1*SIZE(ptrbb), xvec3; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; +ADDQ $2*SIZE, ptrba; +ADDQ $2*SIZE, ptrbb; + +.L333_loopE: 
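+#### The Handle block below fixes up accumulator signs for the conjugation variants selected by the RN/RT/CN/... guards: +# plain pass-through, negation via SUB_DX, or a re/im swap around ADDSUB_DX for the RR/RC/CR/CC cases. +# The alpha scaling that follows assumes addsubpd-style semantics for ADDSUB_DX: one SHUF_DX $0x4e swaps the pair, +# then two multiplies and a final ADDSUB_DX form (alpha_r*cr - alpha_i*ci, alpha_r*ci + alpha_i*cr). 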
+#### Handle #### +XOR_DY yvec7, yvec7, yvec7; +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +ADDSUB_DX xvec15, xvec7, xvec7; +MOV_DX xvec7, xvec15; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +SUB_DX xvec15, xvec7, xvec7; +MOV_DX xvec7, xvec15; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +SHUF_DX $0x4e, xvec15, xvec15; +ADDSUB_DX xvec15, xvec7, xvec7; +MOV_DX xvec7, xvec15; +SHUF_DX $0x4e, xvec15, xvec15; +#endif + +#### Load Alpha #### +BROAD_DX MEMALPHA_R,xvec7; +BROAD_DX MEMALPHA_I,xvec6; +#### Multiply Alpha #### +SHUF_DX $0x4e, xvec15, xvec5; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec6, xvec5, xvec5; +ADDSUB_DX xvec5, xvec15, xvec15; +#### Writing back #### +#ifndef TRMMKERNEL +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; +#endif +STL_DX xvec15, 0*SIZE(C0); +STH_DX xvec15, 1*SIZE(C0); +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +MOVQ bk, %rax; +SUBQ kkk, %rax; +SALQ $ZBASE_SHIFT, %rax; +ADDQ %rax, ptrba; +ADDQ %rax, ptrbb; +#endif +#if defined(TRMMKERNEL) && defined(LEFT) +ADDQ $1, kk; +#endif +ADDQ $2*SIZE, C0; +.L33_loopE: +#if defined(TRMMKERNEL) && !defined(LEFT) +ADDQ $1, kk; +#endif +MOVQ bk, k; +SALQ $4, k; +ADDQ k, bb; +LEAQ (C, ldc, 1), C; +.L30_loopE: +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; + + +vzeroupper + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S index bf318b7ff..8876b61ff 100644 --- a/kernel/x86_64/zgemm_ncopy_2.S +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -85,7 +85,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #endif diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index eb2092dc7..393988e73 100644 --- a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -685,7 +685,7 @@ cmpq $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 23d2da73d..a553bbd39 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -55,7 +55,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -803,7 +803,7 @@ cmpq $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 39f0ff46f..dcfe83189 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ 
b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 711907711..04605e3cb 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -167,7 +167,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 175912c71..e8b01ad7a 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define xt1 %xmm14 #define xt2 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 3e4b17030..40246e52e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) 
movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index fb428cbf5..79f20b641 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index e9edc29ac..f5c100ec1 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index dabc97c3e..18edeed57 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index 7375c3487..f58cecdf5 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 3ab9e5be8..1b589e0cf 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index 85c0ac231..2c47ce3fd 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/l1param.h b/l1param.h index f1d223ea7..0b216c7c5 100644 --- a/l1param.h +++ b/l1param.h @@ -9,6 +9,13 @@ #define ALIGNED_ACCESS #endif +#ifdef SANDYBRIDGE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define 
PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHW prefetchw @@ -60,6 +67,20 @@ #define ALIGNED_ACCESS #endif +#ifdef BOBCAT +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + +#ifdef BULLDOZER +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h index af9d17179..01fe7943d 100644 --- a/l2param.h +++ b/l2param.h @@ -63,6 +63,17 @@ #define PREFETCHSIZE 64 * 3 #endif +#ifdef SANDYBRIDGE +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw @@ -74,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 0db93da92..6f6672099 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, @@ -245,7 +245,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index b637e6db5..4922b9b52 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#if 0 LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); +#else + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); +#endif + for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index a761dee4c..fcea0ae89 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#if 0 +#if 1 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, #endif diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index c19017631..1b0db5f8c 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ 
b/lapack/laswp/generic/laswp_k_1.c @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -58,13 +58,34 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = n; if (j > 0) { @@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + + i--; + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a1 -= 2; #endif i --; - } while (i > 0); } + + //Loop Ending + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 1105aee82..8a8a89bd1 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; @@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif @@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (n <= 0) return 0; j = (n >> 1); + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + a1 += lda; + b1 += lda; + } + return 0; + } + + if (j > 0) { do { piv = ipiv; @@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); + i = ((rows) >> 1); - if (i > 0) { - do { + // Loop pipeline + i--; + + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == 
a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index e08d49667..86ee949c4 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG 
dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = (n >> 2); if (j > 0) { do { @@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG i = ((k2 - k1) >> 1); - if (i > 0) { - do { + i--; //Loop pipeline + //Main Loop + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a7 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = ((rows) >> 1); + i--; + + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; 
+ *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index a4d4bce99..e3a05dbcc 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -60,9 +60,9 @@ #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, - FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *a9, *a11, *a13, *a15; @@ -79,13 +79,35 @@ int 
CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = (n >> 3); if (j > 0) { do { @@ -129,50 +151,51 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + i = (rows >> 1); + i--; + //Loop pipeline + //Main Loop + while (i > 0) { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - B9 = *b9; - B10 = *b10; - B11 = *b11; - B12 = *b12; - B13 = *b13; - B14 = *b14; - B15 = *b15; - B16 = *b16; + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - A9 = *a9; - A10 = *a10; - A11 = *a11; - A12 = *a12; - A13 = *a13; - A14 = *a14; - A15 = *a15; - A16 = *a16; + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; if (b1 == a1) { if (b2 == a1) { @@ -371,51 +394,316 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; - b9 = b1 + 4 * lda; - b10 = b2 + 4 * lda; - b11 = b1 + 5 * lda; - b12 = b2 + 5 * lda; - b13 = b1 + 6 * lda; - b14 = b2 + 6 * lda; - b15 = b1 + 7 * lda; - b16 = b2 + 7 * lda; + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; - a9 += 2; - a11 += 2; - a13 += 2; - a15 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; - a9 -= 2; - a11 -= 2; - a13 -= 2; - a15 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + 
*b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + b9 = b1 + 4 * lda; + b11 = b1 + 5 * lda; + b13 = b1 + 6 * lda; + b15 = b1 + 7 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -487,35 +775,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; + i = (rows >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = 
A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; @@ -524,150 +982,120 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; } else - if (b2 != a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; *a2 = B2; + *b1 = A1; *b2 = A2; + *a3 = B3; *a4 = B4; + *b3 = A3; *b4 = A4; + *a5 = B5; *a6 = B6; + *b5 = A5; *b6 = A6; + *a7 = B7; *a8 = B8; + *b7 = A7; *b8 = A8; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - *a5 = A6; - *a6 = A5; - *a7 = A8; - *a8 = A7; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - *a5 = A6; - *a6 = B6; - *b6 = A5; - *a7 = A8; - *a8 = B8; - *b8 = A7; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - *a5 = A6; - *a6 = B5; - *b5 = A5; - *a7 = A8; - *a8 = B7; - *b7 = A7; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = 
A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - *a5 = B5; - *a6 = A5; - *b5 = A6; - *a7 = B7; - *a8 = A7; - *b7 = A8; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - *a5 = B5; - *a6 = B6; - *b5 = A5; - *b6 = A6; - *a7 = B7; - *a8 = B8; - *b7 = A7; - *b8 = A8; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; - -#ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; -#else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; -#endif - i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); - - if (i > 0) { - A1 = *a1; - B1 = *b1; - A3 = *a3; - B3 = *b3; - A5 = *a5; - B5 = *b5; - A7 = *a7; - B7 = *b7; - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + i = (rows & 1); - a += 4 * lda; + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } + + a += 4 * lda; } if (n & 2) { @@ -692,109 +1120,194 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; + i = ((rows) >> 1); + i--; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; } else - if (b2 != a2) { + if (b2 == 
b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; *a2 = B2; + *b1 = A1; *b2 = A2; + *a3 = B3; *a4 = B4; + *b3 = A3; *b4 = A4; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - + } #ifndef MINUS - a1 += 2; - a3 += 2; + a1 += 2; + a3 += 2; #else - a1 -= 2; - a3 -= 2; + a1 -= 2; + a3 -= 2; #endif - i --; - } while (i > 0); - } - - i = ((k2 - k1) & 1); + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -825,78 +1338,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index 3dd653baf..7a62dd9b8 100644 --- 
a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = n; if (j > 0) { @@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b2 = a + ip2; i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i --; + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); + } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index a877ef66b..0fa685859 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, if (n <= 0) return 0; + + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = (n >> 1); if (j > 0) { @@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 
0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a1 + 0 + lda); @@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -384,12 +557,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) 
= A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 4dc559895..c63a8e2e0 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -76,6 +76,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j<n; j++){ + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + a1 += lda; + b1 += lda; + } + return 0; + } + + j = (n >> 2); if (j > 0) { @@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a7 -= 4; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 
1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -435,37 +717,205 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b3 = b1 + lda; b4 = b2 + lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *(a1 + 0); - A2 = *(a1 + 1); - A3 = *(a2 + 0); - A4 = *(a2 + 1); - - A5 = *(a3 + 0); - A6 = *(a3 + 1); - A7 = *(a4 + 0); - A8 
= *(a4 + 1); + i = (rows >> 1); + i--; - B1 = *(b1 + 0); - B2 = *(b1 + 1); - B3 = *(b2 + 0); - B4 = *(b2 + 1); + //Loop pipeline + //Main Loop + while (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); - B5 = *(b3 + 0); - B6 = *(b3 + 1); - B7 = *(b4 + 0); - B8 = *(b4 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); - ip1 = *piv * 2; - piv += incx; - ip2 = *piv * 2; - piv += incx; + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); - if (b1 == a1) { + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + i --; + } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; @@ -474,122 +924,96 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + 
*(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; } else - if (b2 != a2) { + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - } else { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A1; - *(b2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A5; - *(b4 + 1) = A6; - } - } - } else { - if (b2 == a1) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B1; - *(a2 + 1) = B2; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B5; - *(a4 + 1) = B6; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - } else - if (b2 == a2) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - } else - if (b2 == b1) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(b1 + 0) = A3; - *(b1 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - *(b3 + 0) = A7; - *(b3 + 1) = A8; - } else { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } - - b1 = a + ip1; - b2 = a + ip2; - - b3 = b1 + lda; - b4 = b2 + lda; + } #ifndef MINUS - a1 += 4; - a3 += 4; + a1 += 4; + a3 += 4; #else - a1 -= 4; - a3 -= 4; + a1 -= 4; + a3 -= 4; #endif - i --; - } while (i > 0); - } - - i = ((k2 - k1) & 1); + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + + b1 = a + ip1; + b3 = b1 + lda; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 
= *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile index 105ec4027..434c82a84 100644 --- a/lapack/laswp/x86/Makefile +++ b/lapack/laswp/x86/Makefile @@ -16,12 +16,17 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile index ba07dcf4f..e6dae5344 100644 --- a/lapack/laswp/x86_64/Makefile +++ b/lapack/laswp/x86_64/Makefile @@ -21,12 +21,17 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c index b88f8fc7a..d6d143623 100644 --- a/lapack/potrf/potrf_L_single.c +++ b/lapack/potrf/potrf_L_single.c @@ -66,7 +66,9 @@ static FLOAT dm1 = -1.; #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) -#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +//leave some space for GEMM_ALIGN in sb2 +#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) #if 0 #define SHARED_ARRAY @@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, - - is + js); + is - js); #endif } diff --git a/make.inc b/make.inc index 30004233f..01b9bde92 100644 --- a/make.inc +++ b/make.inc @@ -4,7 +4,7 @@ DRVOPTS = $(OPTS) LOADER = $(FORTRAN) TIMER = NONE ARCHFLAGS= -ru -RANLIB = ranlib +#RANLIB = ranlib BLASLIB = TMGLIB = tmglib.a EIGSRCLIB = eigsrc.a diff --git a/openblas_config_template.h b/openblas_config_template.h index 8bf972593..cf2c037cc 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -39,3 +39,43 @@ typedef int blasint; #define FLOATRET float #endif #endif + +/* Inclusion of a standard header file is needed for definition of __STDC_* + predefined macros with 
some compilers (e.g. GCC 4.7 on Linux). This occurs
+   as a side effect of including either <features.h> or <stdc-predef.h>. */
+#include <stdio.h>
+
+/* C99 supports complex floating numbers natively, which GCC also offers as an
+   extension since version 3.0. If neither are available, use a compatible
+   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
+#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
+     (__GNUC__ >= 3 && !defined(__cplusplus)))
+  #define OPENBLAS_COMPLEX_C99
+  #include <complex.h>
+  typedef float _Complex openblas_complex_float;
+  typedef double _Complex openblas_complex_double;
+  typedef xdouble _Complex openblas_complex_xdouble;
+  #define openblas_make_complex_float(real, imag)  ((real) + ((imag) * _Complex_I))
+  #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
+  #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
+  #define openblas_complex_float_real(z)  (creal(z))
+  #define openblas_complex_float_imag(z)  (cimag(z))
+  #define openblas_complex_double_real(z) (creal(z))
+  #define openblas_complex_double_imag(z) (cimag(z))
+  #define openblas_complex_xdouble_real(z) (creal(z))
+  #define openblas_complex_xdouble_imag(z) (cimag(z))
+#else
+  #define OPENBLAS_COMPLEX_STRUCT
+  typedef struct { float real, imag; } openblas_complex_float;
+  typedef struct { double real, imag; } openblas_complex_double;
+  typedef struct { xdouble real, imag; } openblas_complex_xdouble;
+  #define openblas_make_complex_float(real, imag)  {(real), (imag)}
+  #define openblas_make_complex_double(real, imag) {(real), (imag)}
+  #define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
+  #define openblas_complex_float_real(z)  ((z).real)
+  #define openblas_complex_float_imag(z)  ((z).imag)
+  #define openblas_complex_double_real(z) ((z).real)
+  #define openblas_complex_double_imag(z) ((z).imag)
+  #define openblas_complex_xdouble_real(z) ((z).real)
+  #define openblas_complex_xdouble_imag(z) ((z).imag)
+#endif
diff --git a/param.h b/param.h
index 72d721d4e..5b6a19ad5 100644
--- a/param.h
+++ b/param.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 
 #define SNUMOPT 8
 #define DNUMOPT 4
@@ -913,6 +913,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif +#ifdef SANDYBRIDGE + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +//#define SGEMM_DEFAULT_R 1024 + +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +//#define DGEMM_DEFAULT_R 1024 + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 128 +//#define CGEMM_DEFAULT_R cgemm_r +#define CGEMM_DEFAULT_R 1024 + +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +//#define ZGEMM_DEFAULT_R 1024 + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + + #ifdef ATOM #define SNUMOPT 2 @@ -1404,7 +1482,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_THREAD gemm_thread_mn #endif -#if defined(SPARC) && defined(V9) +#if (defined(SPARC) && defined(V9)) || defined(__sparc_v9__) #define SNUMOPT 2 #define DNUMOPT 2 @@ -1586,26 +1664,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_UNROLL_M 1 #endif -#define SGEMM_P sgemm_p -#define DGEMM_P dgemm_p -#define QGEMM_P qgemm_p -#define CGEMM_P cgemm_p -#define ZGEMM_P zgemm_p -#define XGEMM_P xgemm_p +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p -#define SGEMM_R sgemm_r -#define DGEMM_R dgemm_r -#define QGEMM_R qgemm_r -#define CGEMM_R cgemm_r -#define ZGEMM_R zgemm_r -#define XGEMM_R xgemm_r +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r -#define SGEMM_Q 128 -#define DGEMM_Q 128 -#define QGEMM_Q 128 -#define CGEMM_Q 128 -#define ZGEMM_Q 128 -#define XGEMM_Q 128 +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 #define SYMV_P 16 diff --git a/patch.for_lapack-3.4.0 b/patch.for_lapack-3.4.0 index 9d3cd5f31..a3dc9b8a9 100644 --- a/patch.for_lapack-3.4.0 +++ b/patch.for_lapack-3.4.0 @@ -887,3 +887,2070 @@ diff -ruN lapack-3.4.0.old/TESTING/LIN/Makefile lapack-3.4.0/TESTING/LIN/Makefil ../xlintsts: xlintsts mv xlintsts $@ + +diff -ruN lapack-3.4.0.old/lapacke/src/Makefile lapack-3.4.0/lapacke/src/Makefile +--- lapack-3.4.0.old/lapacke/src/Makefile 2011-11-10 06:56:15.000000000 +0800 ++++ lapack-3.4.0/lapacke/src/Makefile 2012-04-13 22:37:09.000000000 +0800 +@@ -35,12 +35,2060 @@ + include ../make.inc + + C_FILES := $(wildcard *.c) +-OBJ_FILES := $(C_FILES:.c=.o) ++ ++LAPACKE_OBJS := \ ++lapacke_cbbcsd.o \ ++lapacke_cbbcsd_work.o \ ++lapacke_cbdsqr.o \ ++lapacke_cbdsqr_work.o \ ++lapacke_cgbbrd.o \ ++lapacke_cgbbrd_work.o \ ++lapacke_cgbcon.o \ ++lapacke_cgbcon_work.o \ ++lapacke_cgbequb.o \ ++lapacke_cgbequb_work.o \ ++lapacke_cgbequ.o \ ++lapacke_cgbequ_work.o \ ++lapacke_cgbrfs.o \ ++lapacke_cgbrfs_work.o \ ++lapacke_cgbsv.o \ ++lapacke_cgbsv_work.o \ ++lapacke_cgbsvx.o \ ++lapacke_cgbsvx_work.o \ ++lapacke_cgbtrf.o \ ++lapacke_cgbtrf_work.o \ ++lapacke_cgbtrs.o \ ++lapacke_cgbtrs_work.o \ ++lapacke_cgebak.o \ ++lapacke_cgebak_work.o \ ++lapacke_cgebal.o \ ++lapacke_cgebal_work.o \ ++lapacke_cgebrd.o \ ++lapacke_cgebrd_work.o \ ++lapacke_cgecon.o \ ++lapacke_cgecon_work.o \ ++lapacke_cgeequb.o \ ++lapacke_cgeequb_work.o \ ++lapacke_cgeequ.o \ ++lapacke_cgeequ_work.o \ ++lapacke_cgees.o \ ++lapacke_cgees_work.o \ ++lapacke_cgeesx.o \ ++lapacke_cgeesx_work.o \ ++lapacke_cgeev.o \ ++lapacke_cgeev_work.o \ ++lapacke_cgeevx.o \ ++lapacke_cgeevx_work.o \ ++lapacke_cgehrd.o \ ++lapacke_cgehrd_work.o \ ++lapacke_cgelq2.o \ ++lapacke_cgelq2_work.o \ ++lapacke_cgelqf.o \ ++lapacke_cgelqf_work.o \ ++lapacke_cgels.o \ ++lapacke_cgelsd.o \ ++lapacke_cgelsd_work.o \ ++lapacke_cgelss.o \ ++lapacke_cgelss_work.o \ ++lapacke_cgels_work.o \ ++lapacke_cgelsy.o \ ++lapacke_cgelsy_work.o \ ++lapacke_cgemqrt.o \ ++lapacke_cgemqrt_work.o \ ++lapacke_cgeqlf.o \ ++lapacke_cgeqlf_work.o \ ++lapacke_cgeqp3.o \ ++lapacke_cgeqp3_work.o \ ++lapacke_cgeqpf.o \ ++lapacke_cgeqpf_work.o \ ++lapacke_cgeqr2.o \ ++lapacke_cgeqr2_work.o \ ++lapacke_cgeqrf.o \ ++lapacke_cgeqrfp.o \ ++lapacke_cgeqrfp_work.o \ ++lapacke_cgeqrf_work.o \ ++lapacke_cgeqrt2.o \ ++lapacke_cgeqrt2_work.o \ ++lapacke_cgeqrt3.o \ ++lapacke_cgeqrt3_work.o \ ++lapacke_cgeqrt.o \ ++lapacke_cgeqrt_work.o \ ++lapacke_cgerfs.o \ ++lapacke_cgerfs_work.o \ 
++lapacke_cgerqf.o \ ++lapacke_cgerqf_work.o \ ++lapacke_cgesdd.o \ ++lapacke_cgesdd_work.o \ ++lapacke_cgesv.o \ ++lapacke_cgesvd.o \ ++lapacke_cgesvd_work.o \ ++lapacke_cgesv_work.o \ ++lapacke_cgesvx.o \ ++lapacke_cgesvx_work.o \ ++lapacke_cgetf2.o \ ++lapacke_cgetf2_work.o \ ++lapacke_cgetrf.o \ ++lapacke_cgetrf_work.o \ ++lapacke_cgetri.o \ ++lapacke_cgetri_work.o \ ++lapacke_cgetrs.o \ ++lapacke_cgetrs_work.o \ ++lapacke_cggbak.o \ ++lapacke_cggbak_work.o \ ++lapacke_cggbal.o \ ++lapacke_cggbal_work.o \ ++lapacke_cgges.o \ ++lapacke_cgges_work.o \ ++lapacke_cggesx.o \ ++lapacke_cggesx_work.o \ ++lapacke_cggev.o \ ++lapacke_cggev_work.o \ ++lapacke_cggevx.o \ ++lapacke_cggevx_work.o \ ++lapacke_cggglm.o \ ++lapacke_cggglm_work.o \ ++lapacke_cgghrd.o \ ++lapacke_cgghrd_work.o \ ++lapacke_cgglse.o \ ++lapacke_cgglse_work.o \ ++lapacke_cggqrf.o \ ++lapacke_cggqrf_work.o \ ++lapacke_cggrqf.o \ ++lapacke_cggrqf_work.o \ ++lapacke_cggsvd.o \ ++lapacke_cggsvd_work.o \ ++lapacke_cggsvp.o \ ++lapacke_cggsvp_work.o \ ++lapacke_cgtcon.o \ ++lapacke_cgtcon_work.o \ ++lapacke_cgtrfs.o \ ++lapacke_cgtrfs_work.o \ ++lapacke_cgtsv.o \ ++lapacke_cgtsv_work.o \ ++lapacke_cgtsvx.o \ ++lapacke_cgtsvx_work.o \ ++lapacke_cgttrf.o \ ++lapacke_cgttrf_work.o \ ++lapacke_cgttrs.o \ ++lapacke_cgttrs_work.o \ ++lapacke_chbev.o \ ++lapacke_chbevd.o \ ++lapacke_chbevd_work.o \ ++lapacke_chbev_work.o \ ++lapacke_chbevx.o \ ++lapacke_chbevx_work.o \ ++lapacke_chbgst.o \ ++lapacke_chbgst_work.o \ ++lapacke_chbgv.o \ ++lapacke_chbgvd.o \ ++lapacke_chbgvd_work.o \ ++lapacke_chbgv_work.o \ ++lapacke_chbgvx.o \ ++lapacke_chbgvx_work.o \ ++lapacke_chbtrd.o \ ++lapacke_chbtrd_work.o \ ++lapacke_checon.o \ ++lapacke_checon_work.o \ ++lapacke_cheequb.o \ ++lapacke_cheequb_work.o \ ++lapacke_cheev.o \ ++lapacke_cheevd.o \ ++lapacke_cheevd_work.o \ ++lapacke_cheevr.o \ ++lapacke_cheevr_work.o \ ++lapacke_cheev_work.o \ ++lapacke_cheevx.o \ ++lapacke_cheevx_work.o \ ++lapacke_chegst.o \ ++lapacke_chegst_work.o \ ++lapacke_chegv.o \ ++lapacke_chegvd.o \ ++lapacke_chegvd_work.o \ ++lapacke_chegv_work.o \ ++lapacke_chegvx.o \ ++lapacke_chegvx_work.o \ ++lapacke_cherfs.o \ ++lapacke_cherfs_work.o \ ++lapacke_chesv.o \ ++lapacke_chesv_work.o \ ++lapacke_chesvx.o \ ++lapacke_chesvx_work.o \ ++lapacke_cheswapr.o \ ++lapacke_cheswapr_work.o \ ++lapacke_chetrd.o \ ++lapacke_chetrd_work.o \ ++lapacke_chetrf.o \ ++lapacke_chetrf_work.o \ ++lapacke_chetri2.o \ ++lapacke_chetri2_work.o \ ++lapacke_chetri2x.o \ ++lapacke_chetri2x_work.o \ ++lapacke_chetri.o \ ++lapacke_chetri_work.o \ ++lapacke_chetrs2.o \ ++lapacke_chetrs2_work.o \ ++lapacke_chetrs.o \ ++lapacke_chetrs_work.o \ ++lapacke_chfrk.o \ ++lapacke_chfrk_work.o \ ++lapacke_chgeqz.o \ ++lapacke_chgeqz_work.o \ ++lapacke_chpcon.o \ ++lapacke_chpcon_work.o \ ++lapacke_chpev.o \ ++lapacke_chpevd.o \ ++lapacke_chpevd_work.o \ ++lapacke_chpev_work.o \ ++lapacke_chpevx.o \ ++lapacke_chpevx_work.o \ ++lapacke_chpgst.o \ ++lapacke_chpgst_work.o \ ++lapacke_chpgv.o \ ++lapacke_chpgvd.o \ ++lapacke_chpgvd_work.o \ ++lapacke_chpgv_work.o \ ++lapacke_chpgvx.o \ ++lapacke_chpgvx_work.o \ ++lapacke_chprfs.o \ ++lapacke_chprfs_work.o \ ++lapacke_chpsv.o \ ++lapacke_chpsv_work.o \ ++lapacke_chpsvx.o \ ++lapacke_chpsvx_work.o \ ++lapacke_chptrd.o \ ++lapacke_chptrd_work.o \ ++lapacke_chptrf.o \ ++lapacke_chptrf_work.o \ ++lapacke_chptri.o \ ++lapacke_chptri_work.o \ ++lapacke_chptrs.o \ ++lapacke_chptrs_work.o \ ++lapacke_chsein.o \ ++lapacke_chsein_work.o \ ++lapacke_chseqr.o \ 
++lapacke_chseqr_work.o \ ++lapacke_clacgv.o \ ++lapacke_clacgv_work.o \ ++lapacke_clacpy.o \ ++lapacke_clacpy_work.o \ ++lapacke_clag2z.o \ ++lapacke_clag2z_work.o \ ++lapacke_clange.o \ ++lapacke_clange_work.o \ ++lapacke_clanhe.o \ ++lapacke_clanhe_work.o \ ++lapacke_clansy.o \ ++lapacke_clansy_work.o \ ++lapacke_clantr.o \ ++lapacke_clantr_work.o \ ++lapacke_clapmr.o \ ++lapacke_clapmr_work.o \ ++lapacke_clarfb.o \ ++lapacke_clarfb_work.o \ ++lapacke_clarfg.o \ ++lapacke_clarfg_work.o \ ++lapacke_clarft.o \ ++lapacke_clarft_work.o \ ++lapacke_clarfx.o \ ++lapacke_clarfx_work.o \ ++lapacke_clarnv.o \ ++lapacke_clarnv_work.o \ ++lapacke_claset.o \ ++lapacke_claset_work.o \ ++lapacke_claswp.o \ ++lapacke_claswp_work.o \ ++lapacke_clauum.o \ ++lapacke_clauum_work.o \ ++lapacke_cpbcon.o \ ++lapacke_cpbcon_work.o \ ++lapacke_cpbequ.o \ ++lapacke_cpbequ_work.o \ ++lapacke_cpbrfs.o \ ++lapacke_cpbrfs_work.o \ ++lapacke_cpbstf.o \ ++lapacke_cpbstf_work.o \ ++lapacke_cpbsv.o \ ++lapacke_cpbsv_work.o \ ++lapacke_cpbsvx.o \ ++lapacke_cpbsvx_work.o \ ++lapacke_cpbtrf.o \ ++lapacke_cpbtrf_work.o \ ++lapacke_cpbtrs.o \ ++lapacke_cpbtrs_work.o \ ++lapacke_cpftrf.o \ ++lapacke_cpftrf_work.o \ ++lapacke_cpftri.o \ ++lapacke_cpftri_work.o \ ++lapacke_cpftrs.o \ ++lapacke_cpftrs_work.o \ ++lapacke_cpocon.o \ ++lapacke_cpocon_work.o \ ++lapacke_cpoequb.o \ ++lapacke_cpoequb_work.o \ ++lapacke_cpoequ.o \ ++lapacke_cpoequ_work.o \ ++lapacke_cporfs.o \ ++lapacke_cporfs_work.o \ ++lapacke_cposv.o \ ++lapacke_cposv_work.o \ ++lapacke_cposvx.o \ ++lapacke_cposvx_work.o \ ++lapacke_cpotrf.o \ ++lapacke_cpotrf_work.o \ ++lapacke_cpotri.o \ ++lapacke_cpotri_work.o \ ++lapacke_cpotrs.o \ ++lapacke_cpotrs_work.o \ ++lapacke_cppcon.o \ ++lapacke_cppcon_work.o \ ++lapacke_cppequ.o \ ++lapacke_cppequ_work.o \ ++lapacke_cpprfs.o \ ++lapacke_cpprfs_work.o \ ++lapacke_cppsv.o \ ++lapacke_cppsv_work.o \ ++lapacke_cppsvx.o \ ++lapacke_cppsvx_work.o \ ++lapacke_cpptrf.o \ ++lapacke_cpptrf_work.o \ ++lapacke_cpptri.o \ ++lapacke_cpptri_work.o \ ++lapacke_cpptrs.o \ ++lapacke_cpptrs_work.o \ ++lapacke_cpstrf.o \ ++lapacke_cpstrf_work.o \ ++lapacke_cptcon.o \ ++lapacke_cptcon_work.o \ ++lapacke_cpteqr.o \ ++lapacke_cpteqr_work.o \ ++lapacke_cptrfs.o \ ++lapacke_cptrfs_work.o \ ++lapacke_cptsv.o \ ++lapacke_cptsv_work.o \ ++lapacke_cptsvx.o \ ++lapacke_cptsvx_work.o \ ++lapacke_cpttrf.o \ ++lapacke_cpttrf_work.o \ ++lapacke_cpttrs.o \ ++lapacke_cpttrs_work.o \ ++lapacke_cspcon.o \ ++lapacke_cspcon_work.o \ ++lapacke_csprfs.o \ ++lapacke_csprfs_work.o \ ++lapacke_cspsv.o \ ++lapacke_cspsv_work.o \ ++lapacke_cspsvx.o \ ++lapacke_cspsvx_work.o \ ++lapacke_csptrf.o \ ++lapacke_csptrf_work.o \ ++lapacke_csptri.o \ ++lapacke_csptri_work.o \ ++lapacke_csptrs.o \ ++lapacke_csptrs_work.o \ ++lapacke_cstedc.o \ ++lapacke_cstedc_work.o \ ++lapacke_cstegr.o \ ++lapacke_cstegr_work.o \ ++lapacke_cstein.o \ ++lapacke_cstein_work.o \ ++lapacke_cstemr.o \ ++lapacke_cstemr_work.o \ ++lapacke_csteqr.o \ ++lapacke_csteqr_work.o \ ++lapacke_csycon.o \ ++lapacke_csyconv.o \ ++lapacke_csyconv_work.o \ ++lapacke_csycon_work.o \ ++lapacke_csyequb.o \ ++lapacke_csyequb_work.o \ ++lapacke_csyrfs.o \ ++lapacke_csyrfs_work.o \ ++lapacke_csysv.o \ ++lapacke_csysv_work.o \ ++lapacke_csysvx.o \ ++lapacke_csysvx_work.o \ ++lapacke_csyswapr.o \ ++lapacke_csyswapr_work.o \ ++lapacke_csytrf.o \ ++lapacke_csytrf_work.o \ ++lapacke_csytri2.o \ ++lapacke_csytri2_work.o \ ++lapacke_csytri2x.o \ ++lapacke_csytri2x_work.o \ ++lapacke_csytri.o \ ++lapacke_csytri_work.o \ 
++lapacke_csytrs2.o \ ++lapacke_csytrs2_work.o \ ++lapacke_csytrs.o \ ++lapacke_csytrs_work.o \ ++lapacke_ctbcon.o \ ++lapacke_ctbcon_work.o \ ++lapacke_ctbrfs.o \ ++lapacke_ctbrfs_work.o \ ++lapacke_ctbtrs.o \ ++lapacke_ctbtrs_work.o \ ++lapacke_ctfsm.o \ ++lapacke_ctfsm_work.o \ ++lapacke_ctftri.o \ ++lapacke_ctftri_work.o \ ++lapacke_ctfttp.o \ ++lapacke_ctfttp_work.o \ ++lapacke_ctfttr.o \ ++lapacke_ctfttr_work.o \ ++lapacke_ctgevc.o \ ++lapacke_ctgevc_work.o \ ++lapacke_ctgexc.o \ ++lapacke_ctgexc_work.o \ ++lapacke_ctgsen.o \ ++lapacke_ctgsen_work.o \ ++lapacke_ctgsja.o \ ++lapacke_ctgsja_work.o \ ++lapacke_ctgsna.o \ ++lapacke_ctgsna_work.o \ ++lapacke_ctgsyl.o \ ++lapacke_ctgsyl_work.o \ ++lapacke_ctpcon.o \ ++lapacke_ctpcon_work.o \ ++lapacke_ctpmqrt.o \ ++lapacke_ctpmqrt_work.o \ ++lapacke_ctpqrt2.o \ ++lapacke_ctpqrt2_work.o \ ++lapacke_ctpqrt.o \ ++lapacke_ctpqrt_work.o \ ++lapacke_ctprfb.o \ ++lapacke_ctprfb_work.o \ ++lapacke_ctprfs.o \ ++lapacke_ctprfs_work.o \ ++lapacke_ctptri.o \ ++lapacke_ctptri_work.o \ ++lapacke_ctptrs.o \ ++lapacke_ctptrs_work.o \ ++lapacke_ctpttf.o \ ++lapacke_ctpttf_work.o \ ++lapacke_ctpttr.o \ ++lapacke_ctpttr_work.o \ ++lapacke_ctrcon.o \ ++lapacke_ctrcon_work.o \ ++lapacke_ctrevc.o \ ++lapacke_ctrevc_work.o \ ++lapacke_ctrexc.o \ ++lapacke_ctrexc_work.o \ ++lapacke_ctrrfs.o \ ++lapacke_ctrrfs_work.o \ ++lapacke_ctrsen.o \ ++lapacke_ctrsen_work.o \ ++lapacke_ctrsna.o \ ++lapacke_ctrsna_work.o \ ++lapacke_ctrsyl.o \ ++lapacke_ctrsyl_work.o \ ++lapacke_ctrtri.o \ ++lapacke_ctrtri_work.o \ ++lapacke_ctrtrs.o \ ++lapacke_ctrtrs_work.o \ ++lapacke_ctrttf.o \ ++lapacke_ctrttf_work.o \ ++lapacke_ctrttp.o \ ++lapacke_ctrttp_work.o \ ++lapacke_ctzrzf.o \ ++lapacke_ctzrzf_work.o \ ++lapacke_cunbdb.o \ ++lapacke_cunbdb_work.o \ ++lapacke_cuncsd.o \ ++lapacke_cuncsd_work.o \ ++lapacke_cungbr.o \ ++lapacke_cungbr_work.o \ ++lapacke_cunghr.o \ ++lapacke_cunghr_work.o \ ++lapacke_cunglq.o \ ++lapacke_cunglq_work.o \ ++lapacke_cungql.o \ ++lapacke_cungql_work.o \ ++lapacke_cungqr.o \ ++lapacke_cungqr_work.o \ ++lapacke_cungrq.o \ ++lapacke_cungrq_work.o \ ++lapacke_cungtr.o \ ++lapacke_cungtr_work.o \ ++lapacke_cunmbr.o \ ++lapacke_cunmbr_work.o \ ++lapacke_cunmhr.o \ ++lapacke_cunmhr_work.o \ ++lapacke_cunmlq.o \ ++lapacke_cunmlq_work.o \ ++lapacke_cunmql.o \ ++lapacke_cunmql_work.o \ ++lapacke_cunmqr.o \ ++lapacke_cunmqr_work.o \ ++lapacke_cunmrq.o \ ++lapacke_cunmrq_work.o \ ++lapacke_cunmrz.o \ ++lapacke_cunmrz_work.o \ ++lapacke_cunmtr.o \ ++lapacke_cunmtr_work.o \ ++lapacke_cupgtr.o \ ++lapacke_cupgtr_work.o \ ++lapacke_cupmtr.o \ ++lapacke_cupmtr_work.o \ ++lapacke_dbbcsd.o \ ++lapacke_dbbcsd_work.o \ ++lapacke_dbdsdc.o \ ++lapacke_dbdsdc_work.o \ ++lapacke_dbdsqr.o \ ++lapacke_dbdsqr_work.o \ ++lapacke_ddisna.o \ ++lapacke_ddisna_work.o \ ++lapacke_dgbbrd.o \ ++lapacke_dgbbrd_work.o \ ++lapacke_dgbcon.o \ ++lapacke_dgbcon_work.o \ ++lapacke_dgbequb.o \ ++lapacke_dgbequb_work.o \ ++lapacke_dgbequ.o \ ++lapacke_dgbequ_work.o \ ++lapacke_dgbrfs.o \ ++lapacke_dgbrfs_work.o \ ++lapacke_dgbsv.o \ ++lapacke_dgbsv_work.o \ ++lapacke_dgbsvx.o \ ++lapacke_dgbsvx_work.o \ ++lapacke_dgbtrf.o \ ++lapacke_dgbtrf_work.o \ ++lapacke_dgbtrs.o \ ++lapacke_dgbtrs_work.o \ ++lapacke_dgebak.o \ ++lapacke_dgebak_work.o \ ++lapacke_dgebal.o \ ++lapacke_dgebal_work.o \ ++lapacke_dgebrd.o \ ++lapacke_dgebrd_work.o \ ++lapacke_dgecon.o \ ++lapacke_dgecon_work.o \ ++lapacke_dgeequb.o \ ++lapacke_dgeequb_work.o \ ++lapacke_dgeequ.o \ ++lapacke_dgeequ_work.o \ ++lapacke_dgees.o \ 
++lapacke_dgees_work.o \ ++lapacke_dgeesx.o \ ++lapacke_dgeesx_work.o \ ++lapacke_dgeev.o \ ++lapacke_dgeev_work.o \ ++lapacke_dgeevx.o \ ++lapacke_dgeevx_work.o \ ++lapacke_dgehrd.o \ ++lapacke_dgehrd_work.o \ ++lapacke_dgejsv.o \ ++lapacke_dgejsv_work.o \ ++lapacke_dgelq2.o \ ++lapacke_dgelq2_work.o \ ++lapacke_dgelqf.o \ ++lapacke_dgelqf_work.o \ ++lapacke_dgels.o \ ++lapacke_dgelsd.o \ ++lapacke_dgelsd_work.o \ ++lapacke_dgelss.o \ ++lapacke_dgelss_work.o \ ++lapacke_dgels_work.o \ ++lapacke_dgelsy.o \ ++lapacke_dgelsy_work.o \ ++lapacke_dgemqrt.o \ ++lapacke_dgemqrt_work.o \ ++lapacke_dgeqlf.o \ ++lapacke_dgeqlf_work.o \ ++lapacke_dgeqp3.o \ ++lapacke_dgeqp3_work.o \ ++lapacke_dgeqpf.o \ ++lapacke_dgeqpf_work.o \ ++lapacke_dgeqr2.o \ ++lapacke_dgeqr2_work.o \ ++lapacke_dgeqrf.o \ ++lapacke_dgeqrfp.o \ ++lapacke_dgeqrfp_work.o \ ++lapacke_dgeqrf_work.o \ ++lapacke_dgeqrt2.o \ ++lapacke_dgeqrt2_work.o \ ++lapacke_dgeqrt3.o \ ++lapacke_dgeqrt3_work.o \ ++lapacke_dgeqrt.o \ ++lapacke_dgeqrt_work.o \ ++lapacke_dgerfs.o \ ++lapacke_dgerfs_work.o \ ++lapacke_dgerqf.o \ ++lapacke_dgerqf_work.o \ ++lapacke_dgesdd.o \ ++lapacke_dgesdd_work.o \ ++lapacke_dgesv.o \ ++lapacke_dgesvd.o \ ++lapacke_dgesvd_work.o \ ++lapacke_dgesvj.o \ ++lapacke_dgesvj_work.o \ ++lapacke_dgesv_work.o \ ++lapacke_dgesvx.o \ ++lapacke_dgesvx_work.o \ ++lapacke_dgetf2.o \ ++lapacke_dgetf2_work.o \ ++lapacke_dgetrf.o \ ++lapacke_dgetrf_work.o \ ++lapacke_dgetri.o \ ++lapacke_dgetri_work.o \ ++lapacke_dgetrs.o \ ++lapacke_dgetrs_work.o \ ++lapacke_dggbak.o \ ++lapacke_dggbak_work.o \ ++lapacke_dggbal.o \ ++lapacke_dggbal_work.o \ ++lapacke_dgges.o \ ++lapacke_dgges_work.o \ ++lapacke_dggesx.o \ ++lapacke_dggesx_work.o \ ++lapacke_dggev.o \ ++lapacke_dggev_work.o \ ++lapacke_dggevx.o \ ++lapacke_dggevx_work.o \ ++lapacke_dggglm.o \ ++lapacke_dggglm_work.o \ ++lapacke_dgghrd.o \ ++lapacke_dgghrd_work.o \ ++lapacke_dgglse.o \ ++lapacke_dgglse_work.o \ ++lapacke_dggqrf.o \ ++lapacke_dggqrf_work.o \ ++lapacke_dggrqf.o \ ++lapacke_dggrqf_work.o \ ++lapacke_dggsvd.o \ ++lapacke_dggsvd_work.o \ ++lapacke_dggsvp.o \ ++lapacke_dggsvp_work.o \ ++lapacke_dgtcon.o \ ++lapacke_dgtcon_work.o \ ++lapacke_dgtrfs.o \ ++lapacke_dgtrfs_work.o \ ++lapacke_dgtsv.o \ ++lapacke_dgtsv_work.o \ ++lapacke_dgtsvx.o \ ++lapacke_dgtsvx_work.o \ ++lapacke_dgttrf.o \ ++lapacke_dgttrf_work.o \ ++lapacke_dgttrs.o \ ++lapacke_dgttrs_work.o \ ++lapacke_dhgeqz.o \ ++lapacke_dhgeqz_work.o \ ++lapacke_dhsein.o \ ++lapacke_dhsein_work.o \ ++lapacke_dhseqr.o \ ++lapacke_dhseqr_work.o \ ++lapacke_dlacpy.o \ ++lapacke_dlacpy_work.o \ ++lapacke_dlag2s.o \ ++lapacke_dlag2s_work.o \ ++lapacke_dlamch.o \ ++lapacke_dlamch_work.o \ ++lapacke_dlange.o \ ++lapacke_dlange_work.o \ ++lapacke_dlansy.o \ ++lapacke_dlansy_work.o \ ++lapacke_dlantr.o \ ++lapacke_dlantr_work.o \ ++lapacke_dlapmr.o \ ++lapacke_dlapmr_work.o \ ++lapacke_dlapy2.o \ ++lapacke_dlapy2_work.o \ ++lapacke_dlapy3.o \ ++lapacke_dlapy3_work.o \ ++lapacke_dlarfb.o \ ++lapacke_dlarfb_work.o \ ++lapacke_dlarfg.o \ ++lapacke_dlarfg_work.o \ ++lapacke_dlarft.o \ ++lapacke_dlarft_work.o \ ++lapacke_dlarfx.o \ ++lapacke_dlarfx_work.o \ ++lapacke_dlarnv.o \ ++lapacke_dlarnv_work.o \ ++lapacke_dlartgp.o \ ++lapacke_dlartgp_work.o \ ++lapacke_dlartgs.o \ ++lapacke_dlartgs_work.o \ ++lapacke_dlaset.o \ ++lapacke_dlaset_work.o \ ++lapacke_dlasrt.o \ ++lapacke_dlasrt_work.o \ ++lapacke_dlaswp.o \ ++lapacke_dlaswp_work.o \ ++lapacke_dlauum.o \ ++lapacke_dlauum_work.o \ ++lapacke_dopgtr.o \ ++lapacke_dopgtr_work.o \ 
++lapacke_dopmtr.o \ ++lapacke_dopmtr_work.o \ ++lapacke_dorbdb.o \ ++lapacke_dorbdb_work.o \ ++lapacke_dorcsd.o \ ++lapacke_dorcsd_work.o \ ++lapacke_dorgbr.o \ ++lapacke_dorgbr_work.o \ ++lapacke_dorghr.o \ ++lapacke_dorghr_work.o \ ++lapacke_dorglq.o \ ++lapacke_dorglq_work.o \ ++lapacke_dorgql.o \ ++lapacke_dorgql_work.o \ ++lapacke_dorgqr.o \ ++lapacke_dorgqr_work.o \ ++lapacke_dorgrq.o \ ++lapacke_dorgrq_work.o \ ++lapacke_dorgtr.o \ ++lapacke_dorgtr_work.o \ ++lapacke_dormbr.o \ ++lapacke_dormbr_work.o \ ++lapacke_dormhr.o \ ++lapacke_dormhr_work.o \ ++lapacke_dormlq.o \ ++lapacke_dormlq_work.o \ ++lapacke_dormql.o \ ++lapacke_dormql_work.o \ ++lapacke_dormqr.o \ ++lapacke_dormqr_work.o \ ++lapacke_dormrq.o \ ++lapacke_dormrq_work.o \ ++lapacke_dormrz.o \ ++lapacke_dormrz_work.o \ ++lapacke_dormtr.o \ ++lapacke_dormtr_work.o \ ++lapacke_dpbcon.o \ ++lapacke_dpbcon_work.o \ ++lapacke_dpbequ.o \ ++lapacke_dpbequ_work.o \ ++lapacke_dpbrfs.o \ ++lapacke_dpbrfs_work.o \ ++lapacke_dpbstf.o \ ++lapacke_dpbstf_work.o \ ++lapacke_dpbsv.o \ ++lapacke_dpbsv_work.o \ ++lapacke_dpbsvx.o \ ++lapacke_dpbsvx_work.o \ ++lapacke_dpbtrf.o \ ++lapacke_dpbtrf_work.o \ ++lapacke_dpbtrs.o \ ++lapacke_dpbtrs_work.o \ ++lapacke_dpftrf.o \ ++lapacke_dpftrf_work.o \ ++lapacke_dpftri.o \ ++lapacke_dpftri_work.o \ ++lapacke_dpftrs.o \ ++lapacke_dpftrs_work.o \ ++lapacke_dpocon.o \ ++lapacke_dpocon_work.o \ ++lapacke_dpoequb.o \ ++lapacke_dpoequb_work.o \ ++lapacke_dpoequ.o \ ++lapacke_dpoequ_work.o \ ++lapacke_dporfs.o \ ++lapacke_dporfs_work.o \ ++lapacke_dposv.o \ ++lapacke_dposv_work.o \ ++lapacke_dposvx.o \ ++lapacke_dposvx_work.o \ ++lapacke_dpotrf.o \ ++lapacke_dpotrf_work.o \ ++lapacke_dpotri.o \ ++lapacke_dpotri_work.o \ ++lapacke_dpotrs.o \ ++lapacke_dpotrs_work.o \ ++lapacke_dppcon.o \ ++lapacke_dppcon_work.o \ ++lapacke_dppequ.o \ ++lapacke_dppequ_work.o \ ++lapacke_dpprfs.o \ ++lapacke_dpprfs_work.o \ ++lapacke_dppsv.o \ ++lapacke_dppsv_work.o \ ++lapacke_dppsvx.o \ ++lapacke_dppsvx_work.o \ ++lapacke_dpptrf.o \ ++lapacke_dpptrf_work.o \ ++lapacke_dpptri.o \ ++lapacke_dpptri_work.o \ ++lapacke_dpptrs.o \ ++lapacke_dpptrs_work.o \ ++lapacke_dpstrf.o \ ++lapacke_dpstrf_work.o \ ++lapacke_dptcon.o \ ++lapacke_dptcon_work.o \ ++lapacke_dpteqr.o \ ++lapacke_dpteqr_work.o \ ++lapacke_dptrfs.o \ ++lapacke_dptrfs_work.o \ ++lapacke_dptsv.o \ ++lapacke_dptsv_work.o \ ++lapacke_dptsvx.o \ ++lapacke_dptsvx_work.o \ ++lapacke_dpttrf.o \ ++lapacke_dpttrf_work.o \ ++lapacke_dpttrs.o \ ++lapacke_dpttrs_work.o \ ++lapacke_dsbev.o \ ++lapacke_dsbevd.o \ ++lapacke_dsbevd_work.o \ ++lapacke_dsbev_work.o \ ++lapacke_dsbevx.o \ ++lapacke_dsbevx_work.o \ ++lapacke_dsbgst.o \ ++lapacke_dsbgst_work.o \ ++lapacke_dsbgv.o \ ++lapacke_dsbgvd.o \ ++lapacke_dsbgvd_work.o \ ++lapacke_dsbgv_work.o \ ++lapacke_dsbgvx.o \ ++lapacke_dsbgvx_work.o \ ++lapacke_dsbtrd.o \ ++lapacke_dsbtrd_work.o \ ++lapacke_dsfrk.o \ ++lapacke_dsfrk_work.o \ ++lapacke_dsgesv.o \ ++lapacke_dsgesv_work.o \ ++lapacke_dspcon.o \ ++lapacke_dspcon_work.o \ ++lapacke_dspev.o \ ++lapacke_dspevd.o \ ++lapacke_dspevd_work.o \ ++lapacke_dspev_work.o \ ++lapacke_dspevx.o \ ++lapacke_dspevx_work.o \ ++lapacke_dspgst.o \ ++lapacke_dspgst_work.o \ ++lapacke_dspgv.o \ ++lapacke_dspgvd.o \ ++lapacke_dspgvd_work.o \ ++lapacke_dspgv_work.o \ ++lapacke_dspgvx.o \ ++lapacke_dspgvx_work.o \ ++lapacke_dsposv.o \ ++lapacke_dsposv_work.o \ ++lapacke_dsprfs.o \ ++lapacke_dsprfs_work.o \ ++lapacke_dspsv.o \ ++lapacke_dspsv_work.o \ ++lapacke_dspsvx.o \ ++lapacke_dspsvx_work.o \ 
++lapacke_dsptrd.o \ ++lapacke_dsptrd_work.o \ ++lapacke_dsptrf.o \ ++lapacke_dsptrf_work.o \ ++lapacke_dsptri.o \ ++lapacke_dsptri_work.o \ ++lapacke_dsptrs.o \ ++lapacke_dsptrs_work.o \ ++lapacke_dstebz.o \ ++lapacke_dstebz_work.o \ ++lapacke_dstedc.o \ ++lapacke_dstedc_work.o \ ++lapacke_dstegr.o \ ++lapacke_dstegr_work.o \ ++lapacke_dstein.o \ ++lapacke_dstein_work.o \ ++lapacke_dstemr.o \ ++lapacke_dstemr_work.o \ ++lapacke_dsteqr.o \ ++lapacke_dsteqr_work.o \ ++lapacke_dsterf.o \ ++lapacke_dsterf_work.o \ ++lapacke_dstev.o \ ++lapacke_dstevd.o \ ++lapacke_dstevd_work.o \ ++lapacke_dstevr.o \ ++lapacke_dstevr_work.o \ ++lapacke_dstev_work.o \ ++lapacke_dstevx.o \ ++lapacke_dstevx_work.o \ ++lapacke_dsycon.o \ ++lapacke_dsyconv.o \ ++lapacke_dsyconv_work.o \ ++lapacke_dsycon_work.o \ ++lapacke_dsyequb.o \ ++lapacke_dsyequb_work.o \ ++lapacke_dsyev.o \ ++lapacke_dsyevd.o \ ++lapacke_dsyevd_work.o \ ++lapacke_dsyevr.o \ ++lapacke_dsyevr_work.o \ ++lapacke_dsyev_work.o \ ++lapacke_dsyevx.o \ ++lapacke_dsyevx_work.o \ ++lapacke_dsygst.o \ ++lapacke_dsygst_work.o \ ++lapacke_dsygv.o \ ++lapacke_dsygvd.o \ ++lapacke_dsygvd_work.o \ ++lapacke_dsygv_work.o \ ++lapacke_dsygvx.o \ ++lapacke_dsygvx_work.o \ ++lapacke_dsyrfs.o \ ++lapacke_dsyrfs_work.o \ ++lapacke_dsysv.o \ ++lapacke_dsysv_work.o \ ++lapacke_dsysvx.o \ ++lapacke_dsysvx_work.o \ ++lapacke_dsyswapr.o \ ++lapacke_dsyswapr_work.o \ ++lapacke_dsytrd.o \ ++lapacke_dsytrd_work.o \ ++lapacke_dsytrf.o \ ++lapacke_dsytrf_work.o \ ++lapacke_dsytri2.o \ ++lapacke_dsytri2_work.o \ ++lapacke_dsytri2x.o \ ++lapacke_dsytri2x_work.o \ ++lapacke_dsytri.o \ ++lapacke_dsytri_work.o \ ++lapacke_dsytrs2.o \ ++lapacke_dsytrs2_work.o \ ++lapacke_dsytrs.o \ ++lapacke_dsytrs_work.o \ ++lapacke_dtbcon.o \ ++lapacke_dtbcon_work.o \ ++lapacke_dtbrfs.o \ ++lapacke_dtbrfs_work.o \ ++lapacke_dtbtrs.o \ ++lapacke_dtbtrs_work.o \ ++lapacke_dtfsm.o \ ++lapacke_dtfsm_work.o \ ++lapacke_dtftri.o \ ++lapacke_dtftri_work.o \ ++lapacke_dtfttp.o \ ++lapacke_dtfttp_work.o \ ++lapacke_dtfttr.o \ ++lapacke_dtfttr_work.o \ ++lapacke_dtgevc.o \ ++lapacke_dtgevc_work.o \ ++lapacke_dtgexc.o \ ++lapacke_dtgexc_work.o \ ++lapacke_dtgsen.o \ ++lapacke_dtgsen_work.o \ ++lapacke_dtgsja.o \ ++lapacke_dtgsja_work.o \ ++lapacke_dtgsna.o \ ++lapacke_dtgsna_work.o \ ++lapacke_dtgsyl.o \ ++lapacke_dtgsyl_work.o \ ++lapacke_dtpcon.o \ ++lapacke_dtpcon_work.o \ ++lapacke_dtpmqrt.o \ ++lapacke_dtpmqrt_work.o \ ++lapacke_dtpqrt2.o \ ++lapacke_dtpqrt2_work.o \ ++lapacke_dtpqrt.o \ ++lapacke_dtpqrt_work.o \ ++lapacke_dtprfb.o \ ++lapacke_dtprfb_work.o \ ++lapacke_dtprfs.o \ ++lapacke_dtprfs_work.o \ ++lapacke_dtptri.o \ ++lapacke_dtptri_work.o \ ++lapacke_dtptrs.o \ ++lapacke_dtptrs_work.o \ ++lapacke_dtpttf.o \ ++lapacke_dtpttf_work.o \ ++lapacke_dtpttr.o \ ++lapacke_dtpttr_work.o \ ++lapacke_dtrcon.o \ ++lapacke_dtrcon_work.o \ ++lapacke_dtrevc.o \ ++lapacke_dtrevc_work.o \ ++lapacke_dtrexc.o \ ++lapacke_dtrexc_work.o \ ++lapacke_dtrrfs.o \ ++lapacke_dtrrfs_work.o \ ++lapacke_dtrsen.o \ ++lapacke_dtrsen_work.o \ ++lapacke_dtrsna.o \ ++lapacke_dtrsna_work.o \ ++lapacke_dtrsyl.o \ ++lapacke_dtrsyl_work.o \ ++lapacke_dtrtri.o \ ++lapacke_dtrtri_work.o \ ++lapacke_dtrtrs.o \ ++lapacke_dtrtrs_work.o \ ++lapacke_dtrttf.o \ ++lapacke_dtrttf_work.o \ ++lapacke_dtrttp.o \ ++lapacke_dtrttp_work.o \ ++lapacke_dtzrzf.o \ ++lapacke_dtzrzf_work.o \ ++lapacke_sbbcsd.o \ ++lapacke_sbbcsd_work.o \ ++lapacke_sbdsdc.o \ ++lapacke_sbdsdc_work.o \ ++lapacke_sbdsqr.o \ ++lapacke_sbdsqr_work.o \ ++lapacke_sdisna.o 
\ ++lapacke_sdisna_work.o \ ++lapacke_sgbbrd.o \ ++lapacke_sgbbrd_work.o \ ++lapacke_sgbcon.o \ ++lapacke_sgbcon_work.o \ ++lapacke_sgbequb.o \ ++lapacke_sgbequb_work.o \ ++lapacke_sgbequ.o \ ++lapacke_sgbequ_work.o \ ++lapacke_sgbrfs.o \ ++lapacke_sgbrfs_work.o \ ++lapacke_sgbsv.o \ ++lapacke_sgbsv_work.o \ ++lapacke_sgbsvx.o \ ++lapacke_sgbsvx_work.o \ ++lapacke_sgbtrf.o \ ++lapacke_sgbtrf_work.o \ ++lapacke_sgbtrs.o \ ++lapacke_sgbtrs_work.o \ ++lapacke_sgebak.o \ ++lapacke_sgebak_work.o \ ++lapacke_sgebal.o \ ++lapacke_sgebal_work.o \ ++lapacke_sgebrd.o \ ++lapacke_sgebrd_work.o \ ++lapacke_sgecon.o \ ++lapacke_sgecon_work.o \ ++lapacke_sgeequb.o \ ++lapacke_sgeequb_work.o \ ++lapacke_sgeequ.o \ ++lapacke_sgeequ_work.o \ ++lapacke_sgees.o \ ++lapacke_sgees_work.o \ ++lapacke_sgeesx.o \ ++lapacke_sgeesx_work.o \ ++lapacke_sgeev.o \ ++lapacke_sgeev_work.o \ ++lapacke_sgeevx.o \ ++lapacke_sgeevx_work.o \ ++lapacke_sgehrd.o \ ++lapacke_sgehrd_work.o \ ++lapacke_sgejsv.o \ ++lapacke_sgejsv_work.o \ ++lapacke_sgelq2.o \ ++lapacke_sgelq2_work.o \ ++lapacke_sgelqf.o \ ++lapacke_sgelqf_work.o \ ++lapacke_sgels.o \ ++lapacke_sgelsd.o \ ++lapacke_sgelsd_work.o \ ++lapacke_sgelss.o \ ++lapacke_sgelss_work.o \ ++lapacke_sgels_work.o \ ++lapacke_sgelsy.o \ ++lapacke_sgelsy_work.o \ ++lapacke_sgemqrt.o \ ++lapacke_sgemqrt_work.o \ ++lapacke_sgeqlf.o \ ++lapacke_sgeqlf_work.o \ ++lapacke_sgeqp3.o \ ++lapacke_sgeqp3_work.o \ ++lapacke_sgeqpf.o \ ++lapacke_sgeqpf_work.o \ ++lapacke_sgeqr2.o \ ++lapacke_sgeqr2_work.o \ ++lapacke_sgeqrf.o \ ++lapacke_sgeqrfp.o \ ++lapacke_sgeqrfp_work.o \ ++lapacke_sgeqrf_work.o \ ++lapacke_sgeqrt2.o \ ++lapacke_sgeqrt2_work.o \ ++lapacke_sgeqrt3.o \ ++lapacke_sgeqrt3_work.o \ ++lapacke_sgeqrt.o \ ++lapacke_sgeqrt_work.o \ ++lapacke_sgerfs.o \ ++lapacke_sgerfs_work.o \ ++lapacke_sgerqf.o \ ++lapacke_sgerqf_work.o \ ++lapacke_sgesdd.o \ ++lapacke_sgesdd_work.o \ ++lapacke_sgesv.o \ ++lapacke_sgesvd.o \ ++lapacke_sgesvd_work.o \ ++lapacke_sgesvj.o \ ++lapacke_sgesvj_work.o \ ++lapacke_sgesv_work.o \ ++lapacke_sgesvx.o \ ++lapacke_sgesvx_work.o \ ++lapacke_sgetf2.o \ ++lapacke_sgetf2_work.o \ ++lapacke_sgetrf.o \ ++lapacke_sgetrf_work.o \ ++lapacke_sgetri.o \ ++lapacke_sgetri_work.o \ ++lapacke_sgetrs.o \ ++lapacke_sgetrs_work.o \ ++lapacke_sggbak.o \ ++lapacke_sggbak_work.o \ ++lapacke_sggbal.o \ ++lapacke_sggbal_work.o \ ++lapacke_sgges.o \ ++lapacke_sgges_work.o \ ++lapacke_sggesx.o \ ++lapacke_sggesx_work.o \ ++lapacke_sggev.o \ ++lapacke_sggev_work.o \ ++lapacke_sggevx.o \ ++lapacke_sggevx_work.o \ ++lapacke_sggglm.o \ ++lapacke_sggglm_work.o \ ++lapacke_sgghrd.o \ ++lapacke_sgghrd_work.o \ ++lapacke_sgglse.o \ ++lapacke_sgglse_work.o \ ++lapacke_sggqrf.o \ ++lapacke_sggqrf_work.o \ ++lapacke_sggrqf.o \ ++lapacke_sggrqf_work.o \ ++lapacke_sggsvd.o \ ++lapacke_sggsvd_work.o \ ++lapacke_sggsvp.o \ ++lapacke_sggsvp_work.o \ ++lapacke_sgtcon.o \ ++lapacke_sgtcon_work.o \ ++lapacke_sgtrfs.o \ ++lapacke_sgtrfs_work.o \ ++lapacke_sgtsv.o \ ++lapacke_sgtsv_work.o \ ++lapacke_sgtsvx.o \ ++lapacke_sgtsvx_work.o \ ++lapacke_sgttrf.o \ ++lapacke_sgttrf_work.o \ ++lapacke_sgttrs.o \ ++lapacke_sgttrs_work.o \ ++lapacke_shgeqz.o \ ++lapacke_shgeqz_work.o \ ++lapacke_shsein.o \ ++lapacke_shsein_work.o \ ++lapacke_shseqr.o \ ++lapacke_shseqr_work.o \ ++lapacke_slacpy.o \ ++lapacke_slacpy_work.o \ ++lapacke_slag2d.o \ ++lapacke_slag2d_work.o \ ++lapacke_slamch.o \ ++lapacke_slamch_work.o \ ++lapacke_slange.o \ ++lapacke_slange_work.o \ ++lapacke_slansy.o \ ++lapacke_slansy_work.o \ 
++lapacke_slantr.o \ ++lapacke_slantr_work.o \ ++lapacke_slapmr.o \ ++lapacke_slapmr_work.o \ ++lapacke_slapy2.o \ ++lapacke_slapy2_work.o \ ++lapacke_slapy3.o \ ++lapacke_slapy3_work.o \ ++lapacke_slarfb.o \ ++lapacke_slarfb_work.o \ ++lapacke_slarfg.o \ ++lapacke_slarfg_work.o \ ++lapacke_slarft.o \ ++lapacke_slarft_work.o \ ++lapacke_slarfx.o \ ++lapacke_slarfx_work.o \ ++lapacke_slarnv.o \ ++lapacke_slarnv_work.o \ ++lapacke_slartgp.o \ ++lapacke_slartgp_work.o \ ++lapacke_slartgs.o \ ++lapacke_slartgs_work.o \ ++lapacke_slaset.o \ ++lapacke_slaset_work.o \ ++lapacke_slasrt.o \ ++lapacke_slasrt_work.o \ ++lapacke_slaswp.o \ ++lapacke_slaswp_work.o \ ++lapacke_slauum.o \ ++lapacke_slauum_work.o \ ++lapacke_sopgtr.o \ ++lapacke_sopgtr_work.o \ ++lapacke_sopmtr.o \ ++lapacke_sopmtr_work.o \ ++lapacke_sorbdb.o \ ++lapacke_sorbdb_work.o \ ++lapacke_sorcsd.o \ ++lapacke_sorcsd_work.o \ ++lapacke_sorgbr.o \ ++lapacke_sorgbr_work.o \ ++lapacke_sorghr.o \ ++lapacke_sorghr_work.o \ ++lapacke_sorglq.o \ ++lapacke_sorglq_work.o \ ++lapacke_sorgql.o \ ++lapacke_sorgql_work.o \ ++lapacke_sorgqr.o \ ++lapacke_sorgqr_work.o \ ++lapacke_sorgrq.o \ ++lapacke_sorgrq_work.o \ ++lapacke_sorgtr.o \ ++lapacke_sorgtr_work.o \ ++lapacke_sormbr.o \ ++lapacke_sormbr_work.o \ ++lapacke_sormhr.o \ ++lapacke_sormhr_work.o \ ++lapacke_sormlq.o \ ++lapacke_sormlq_work.o \ ++lapacke_sormql.o \ ++lapacke_sormql_work.o \ ++lapacke_sormqr.o \ ++lapacke_sormqr_work.o \ ++lapacke_sormrq.o \ ++lapacke_sormrq_work.o \ ++lapacke_sormrz.o \ ++lapacke_sormrz_work.o \ ++lapacke_sormtr.o \ ++lapacke_sormtr_work.o \ ++lapacke_spbcon.o \ ++lapacke_spbcon_work.o \ ++lapacke_spbequ.o \ ++lapacke_spbequ_work.o \ ++lapacke_spbrfs.o \ ++lapacke_spbrfs_work.o \ ++lapacke_spbstf.o \ ++lapacke_spbstf_work.o \ ++lapacke_spbsv.o \ ++lapacke_spbsv_work.o \ ++lapacke_spbsvx.o \ ++lapacke_spbsvx_work.o \ ++lapacke_spbtrf.o \ ++lapacke_spbtrf_work.o \ ++lapacke_spbtrs.o \ ++lapacke_spbtrs_work.o \ ++lapacke_spftrf.o \ ++lapacke_spftrf_work.o \ ++lapacke_spftri.o \ ++lapacke_spftri_work.o \ ++lapacke_spftrs.o \ ++lapacke_spftrs_work.o \ ++lapacke_spocon.o \ ++lapacke_spocon_work.o \ ++lapacke_spoequb.o \ ++lapacke_spoequb_work.o \ ++lapacke_spoequ.o \ ++lapacke_spoequ_work.o \ ++lapacke_sporfs.o \ ++lapacke_sporfs_work.o \ ++lapacke_sposv.o \ ++lapacke_sposv_work.o \ ++lapacke_sposvx.o \ ++lapacke_sposvx_work.o \ ++lapacke_spotrf.o \ ++lapacke_spotrf_work.o \ ++lapacke_spotri.o \ ++lapacke_spotri_work.o \ ++lapacke_spotrs.o \ ++lapacke_spotrs_work.o \ ++lapacke_sppcon.o \ ++lapacke_sppcon_work.o \ ++lapacke_sppequ.o \ ++lapacke_sppequ_work.o \ ++lapacke_spprfs.o \ ++lapacke_spprfs_work.o \ ++lapacke_sppsv.o \ ++lapacke_sppsv_work.o \ ++lapacke_sppsvx.o \ ++lapacke_sppsvx_work.o \ ++lapacke_spptrf.o \ ++lapacke_spptrf_work.o \ ++lapacke_spptri.o \ ++lapacke_spptri_work.o \ ++lapacke_spptrs.o \ ++lapacke_spptrs_work.o \ ++lapacke_spstrf.o \ ++lapacke_spstrf_work.o \ ++lapacke_sptcon.o \ ++lapacke_sptcon_work.o \ ++lapacke_spteqr.o \ ++lapacke_spteqr_work.o \ ++lapacke_sptrfs.o \ ++lapacke_sptrfs_work.o \ ++lapacke_sptsv.o \ ++lapacke_sptsv_work.o \ ++lapacke_sptsvx.o \ ++lapacke_sptsvx_work.o \ ++lapacke_spttrf.o \ ++lapacke_spttrf_work.o \ ++lapacke_spttrs.o \ ++lapacke_spttrs_work.o \ ++lapacke_ssbev.o \ ++lapacke_ssbevd.o \ ++lapacke_ssbevd_work.o \ ++lapacke_ssbev_work.o \ ++lapacke_ssbevx.o \ ++lapacke_ssbevx_work.o \ ++lapacke_ssbgst.o \ ++lapacke_ssbgst_work.o \ ++lapacke_ssbgv.o \ ++lapacke_ssbgvd.o \ ++lapacke_ssbgvd_work.o \ 
++lapacke_ssbgv_work.o \ ++lapacke_ssbgvx.o \ ++lapacke_ssbgvx_work.o \ ++lapacke_ssbtrd.o \ ++lapacke_ssbtrd_work.o \ ++lapacke_ssfrk.o \ ++lapacke_ssfrk_work.o \ ++lapacke_sspcon.o \ ++lapacke_sspcon_work.o \ ++lapacke_sspev.o \ ++lapacke_sspevd.o \ ++lapacke_sspevd_work.o \ ++lapacke_sspev_work.o \ ++lapacke_sspevx.o \ ++lapacke_sspevx_work.o \ ++lapacke_sspgst.o \ ++lapacke_sspgst_work.o \ ++lapacke_sspgv.o \ ++lapacke_sspgvd.o \ ++lapacke_sspgvd_work.o \ ++lapacke_sspgv_work.o \ ++lapacke_sspgvx.o \ ++lapacke_sspgvx_work.o \ ++lapacke_ssprfs.o \ ++lapacke_ssprfs_work.o \ ++lapacke_sspsv.o \ ++lapacke_sspsv_work.o \ ++lapacke_sspsvx.o \ ++lapacke_sspsvx_work.o \ ++lapacke_ssptrd.o \ ++lapacke_ssptrd_work.o \ ++lapacke_ssptrf.o \ ++lapacke_ssptrf_work.o \ ++lapacke_ssptri.o \ ++lapacke_ssptri_work.o \ ++lapacke_ssptrs.o \ ++lapacke_ssptrs_work.o \ ++lapacke_sstebz.o \ ++lapacke_sstebz_work.o \ ++lapacke_sstedc.o \ ++lapacke_sstedc_work.o \ ++lapacke_sstegr.o \ ++lapacke_sstegr_work.o \ ++lapacke_sstein.o \ ++lapacke_sstein_work.o \ ++lapacke_sstemr.o \ ++lapacke_sstemr_work.o \ ++lapacke_ssteqr.o \ ++lapacke_ssteqr_work.o \ ++lapacke_ssterf.o \ ++lapacke_ssterf_work.o \ ++lapacke_sstev.o \ ++lapacke_sstevd.o \ ++lapacke_sstevd_work.o \ ++lapacke_sstevr.o \ ++lapacke_sstevr_work.o \ ++lapacke_sstev_work.o \ ++lapacke_sstevx.o \ ++lapacke_sstevx_work.o \ ++lapacke_ssycon.o \ ++lapacke_ssyconv.o \ ++lapacke_ssyconv_work.o \ ++lapacke_ssycon_work.o \ ++lapacke_ssyequb.o \ ++lapacke_ssyequb_work.o \ ++lapacke_ssyev.o \ ++lapacke_ssyevd.o \ ++lapacke_ssyevd_work.o \ ++lapacke_ssyevr.o \ ++lapacke_ssyevr_work.o \ ++lapacke_ssyev_work.o \ ++lapacke_ssyevx.o \ ++lapacke_ssyevx_work.o \ ++lapacke_ssygst.o \ ++lapacke_ssygst_work.o \ ++lapacke_ssygv.o \ ++lapacke_ssygvd.o \ ++lapacke_ssygvd_work.o \ ++lapacke_ssygv_work.o \ ++lapacke_ssygvx.o \ ++lapacke_ssygvx_work.o \ ++lapacke_ssyrfs.o \ ++lapacke_ssyrfs_work.o \ ++lapacke_ssysv.o \ ++lapacke_ssysv_work.o \ ++lapacke_ssysvx.o \ ++lapacke_ssysvx_work.o \ ++lapacke_ssyswapr.o \ ++lapacke_ssyswapr_work.o \ ++lapacke_ssytrd.o \ ++lapacke_ssytrd_work.o \ ++lapacke_ssytrf.o \ ++lapacke_ssytrf_work.o \ ++lapacke_ssytri2.o \ ++lapacke_ssytri2_work.o \ ++lapacke_ssytri2x.o \ ++lapacke_ssytri2x_work.o \ ++lapacke_ssytri.o \ ++lapacke_ssytri_work.o \ ++lapacke_ssytrs2.o \ ++lapacke_ssytrs2_work.o \ ++lapacke_ssytrs.o \ ++lapacke_ssytrs_work.o \ ++lapacke_stbcon.o \ ++lapacke_stbcon_work.o \ ++lapacke_stbrfs.o \ ++lapacke_stbrfs_work.o \ ++lapacke_stbtrs.o \ ++lapacke_stbtrs_work.o \ ++lapacke_stfsm.o \ ++lapacke_stfsm_work.o \ ++lapacke_stftri.o \ ++lapacke_stftri_work.o \ ++lapacke_stfttp.o \ ++lapacke_stfttp_work.o \ ++lapacke_stfttr.o \ ++lapacke_stfttr_work.o \ ++lapacke_stgevc.o \ ++lapacke_stgevc_work.o \ ++lapacke_stgexc.o \ ++lapacke_stgexc_work.o \ ++lapacke_stgsen.o \ ++lapacke_stgsen_work.o \ ++lapacke_stgsja.o \ ++lapacke_stgsja_work.o \ ++lapacke_stgsna.o \ ++lapacke_stgsna_work.o \ ++lapacke_stgsyl.o \ ++lapacke_stgsyl_work.o \ ++lapacke_stpcon.o \ ++lapacke_stpcon_work.o \ ++lapacke_stpmqrt.o \ ++lapacke_stpmqrt_work.o \ ++lapacke_stpqrt2.o \ ++lapacke_stpqrt2_work.o \ ++lapacke_stprfb.o \ ++lapacke_stprfb_work.o \ ++lapacke_stprfs.o \ ++lapacke_stprfs_work.o \ ++lapacke_stptri.o \ ++lapacke_stptri_work.o \ ++lapacke_stptrs.o \ ++lapacke_stptrs_work.o \ ++lapacke_stpttf.o \ ++lapacke_stpttf_work.o \ ++lapacke_stpttr.o \ ++lapacke_stpttr_work.o \ ++lapacke_strcon.o \ ++lapacke_strcon_work.o \ ++lapacke_strevc.o \ ++lapacke_strevc_work.o \ 
++lapacke_strexc.o \ ++lapacke_strexc_work.o \ ++lapacke_strrfs.o \ ++lapacke_strrfs_work.o \ ++lapacke_strsen.o \ ++lapacke_strsen_work.o \ ++lapacke_strsna.o \ ++lapacke_strsna_work.o \ ++lapacke_strsyl.o \ ++lapacke_strsyl_work.o \ ++lapacke_strtri.o \ ++lapacke_strtri_work.o \ ++lapacke_strtrs.o \ ++lapacke_strtrs_work.o \ ++lapacke_strttf.o \ ++lapacke_strttf_work.o \ ++lapacke_strttp.o \ ++lapacke_strttp_work.o \ ++lapacke_stzrzf.o \ ++lapacke_stzrzf_work.o \ ++lapacke_zbbcsd.o \ ++lapacke_zbbcsd_work.o \ ++lapacke_zbdsqr.o \ ++lapacke_zbdsqr_work.o \ ++lapacke_zcgesv.o \ ++lapacke_zcgesv_work.o \ ++lapacke_zcposv.o \ ++lapacke_zcposv_work.o \ ++lapacke_zgbbrd.o \ ++lapacke_zgbbrd_work.o \ ++lapacke_zgbcon.o \ ++lapacke_zgbcon_work.o \ ++lapacke_zgbequb.o \ ++lapacke_zgbequb_work.o \ ++lapacke_zgbequ.o \ ++lapacke_zgbequ_work.o \ ++lapacke_zgbrfs.o \ ++lapacke_zgbrfs_work.o \ ++lapacke_zgbsv.o \ ++lapacke_zgbsv_work.o \ ++lapacke_zgbsvx.o \ ++lapacke_zgbsvx_work.o \ ++lapacke_zgbtrf.o \ ++lapacke_zgbtrf_work.o \ ++lapacke_zgbtrs.o \ ++lapacke_zgbtrs_work.o \ ++lapacke_zgebak.o \ ++lapacke_zgebak_work.o \ ++lapacke_zgebal.o \ ++lapacke_zgebal_work.o \ ++lapacke_zgebrd.o \ ++lapacke_zgebrd_work.o \ ++lapacke_zgecon.o \ ++lapacke_zgecon_work.o \ ++lapacke_zgeequb.o \ ++lapacke_zgeequb_work.o \ ++lapacke_zgeequ.o \ ++lapacke_zgeequ_work.o \ ++lapacke_zgees.o \ ++lapacke_zgees_work.o \ ++lapacke_zgeesx.o \ ++lapacke_zgeesx_work.o \ ++lapacke_zgeev.o \ ++lapacke_zgeev_work.o \ ++lapacke_zgeevx.o \ ++lapacke_zgeevx_work.o \ ++lapacke_zgehrd.o \ ++lapacke_zgehrd_work.o \ ++lapacke_zgelq2.o \ ++lapacke_zgelq2_work.o \ ++lapacke_zgelqf.o \ ++lapacke_zgelqf_work.o \ ++lapacke_zgels.o \ ++lapacke_zgelsd.o \ ++lapacke_zgelsd_work.o \ ++lapacke_zgelss.o \ ++lapacke_zgelss_work.o \ ++lapacke_zgels_work.o \ ++lapacke_zgelsy.o \ ++lapacke_zgelsy_work.o \ ++lapacke_zgemqrt.o \ ++lapacke_zgemqrt_work.o \ ++lapacke_zgeqlf.o \ ++lapacke_zgeqlf_work.o \ ++lapacke_zgeqp3.o \ ++lapacke_zgeqp3_work.o \ ++lapacke_zgeqpf.o \ ++lapacke_zgeqpf_work.o \ ++lapacke_zgeqr2.o \ ++lapacke_zgeqr2_work.o \ ++lapacke_zgeqrf.o \ ++lapacke_zgeqrfp.o \ ++lapacke_zgeqrfp_work.o \ ++lapacke_zgeqrf_work.o \ ++lapacke_zgeqrt2.o \ ++lapacke_zgeqrt2_work.o \ ++lapacke_zgeqrt3.o \ ++lapacke_zgeqrt3_work.o \ ++lapacke_zgeqrt.o \ ++lapacke_zgeqrt_work.o \ ++lapacke_zgerfs.o \ ++lapacke_zgerfs_work.o \ ++lapacke_zgerqf.o \ ++lapacke_zgerqf_work.o \ ++lapacke_zgesdd.o \ ++lapacke_zgesdd_work.o \ ++lapacke_zgesv.o \ ++lapacke_zgesvd.o \ ++lapacke_zgesvd_work.o \ ++lapacke_zgesv_work.o \ ++lapacke_zgesvx.o \ ++lapacke_zgesvx_work.o \ ++lapacke_zgetf2.o \ ++lapacke_zgetf2_work.o \ ++lapacke_zgetrf.o \ ++lapacke_zgetrf_work.o \ ++lapacke_zgetri.o \ ++lapacke_zgetri_work.o \ ++lapacke_zgetrs.o \ ++lapacke_zgetrs_work.o \ ++lapacke_zggbak.o \ ++lapacke_zggbak_work.o \ ++lapacke_zggbal.o \ ++lapacke_zggbal_work.o \ ++lapacke_zgges.o \ ++lapacke_zgges_work.o \ ++lapacke_zggesx.o \ ++lapacke_zggesx_work.o \ ++lapacke_zggev.o \ ++lapacke_zggev_work.o \ ++lapacke_zggevx.o \ ++lapacke_zggevx_work.o \ ++lapacke_zggglm.o \ ++lapacke_zggglm_work.o \ ++lapacke_zgghrd.o \ ++lapacke_zgghrd_work.o \ ++lapacke_zgglse.o \ ++lapacke_zgglse_work.o \ ++lapacke_zggqrf.o \ ++lapacke_zggqrf_work.o \ ++lapacke_zggrqf.o \ ++lapacke_zggrqf_work.o \ ++lapacke_zggsvd.o \ ++lapacke_zggsvd_work.o \ ++lapacke_zggsvp.o \ ++lapacke_zggsvp_work.o \ ++lapacke_zgtcon.o \ ++lapacke_zgtcon_work.o \ ++lapacke_zgtrfs.o \ ++lapacke_zgtrfs_work.o \ ++lapacke_zgtsv.o \ 
++lapacke_zgtsv_work.o \ ++lapacke_zgtsvx.o \ ++lapacke_zgtsvx_work.o \ ++lapacke_zgttrf.o \ ++lapacke_zgttrf_work.o \ ++lapacke_zgttrs.o \ ++lapacke_zgttrs_work.o \ ++lapacke_zhbev.o \ ++lapacke_zhbevd.o \ ++lapacke_zhbevd_work.o \ ++lapacke_zhbev_work.o \ ++lapacke_zhbevx.o \ ++lapacke_zhbevx_work.o \ ++lapacke_zhbgst.o \ ++lapacke_zhbgst_work.o \ ++lapacke_zhbgv.o \ ++lapacke_zhbgvd.o \ ++lapacke_zhbgvd_work.o \ ++lapacke_zhbgv_work.o \ ++lapacke_zhbgvx.o \ ++lapacke_zhbgvx_work.o \ ++lapacke_zhbtrd.o \ ++lapacke_zhbtrd_work.o \ ++lapacke_zhecon.o \ ++lapacke_zhecon_work.o \ ++lapacke_zheequb.o \ ++lapacke_zheequb_work.o \ ++lapacke_zheev.o \ ++lapacke_zheevd.o \ ++lapacke_zheevd_work.o \ ++lapacke_zheevr.o \ ++lapacke_zheevr_work.o \ ++lapacke_zheev_work.o \ ++lapacke_zheevx.o \ ++lapacke_zheevx_work.o \ ++lapacke_zhegst.o \ ++lapacke_zhegst_work.o \ ++lapacke_zhegv.o \ ++lapacke_zhegvd.o \ ++lapacke_zhegvd_work.o \ ++lapacke_zhegv_work.o \ ++lapacke_zhegvx.o \ ++lapacke_zhegvx_work.o \ ++lapacke_zherfs.o \ ++lapacke_zherfs_work.o \ ++lapacke_zhesv.o \ ++lapacke_zhesv_work.o \ ++lapacke_zhesvx.o \ ++lapacke_zhesvx_work.o \ ++lapacke_zheswapr.o \ ++lapacke_zheswapr_work.o \ ++lapacke_zhetrd.o \ ++lapacke_zhetrd_work.o \ ++lapacke_zhetrf.o \ ++lapacke_zhetrf_work.o \ ++lapacke_zhetri2.o \ ++lapacke_zhetri2_work.o \ ++lapacke_zhetri2x.o \ ++lapacke_zhetri2x_work.o \ ++lapacke_zhetri.o \ ++lapacke_zhetri_work.o \ ++lapacke_zhetrs2.o \ ++lapacke_zhetrs2_work.o \ ++lapacke_zhetrs.o \ ++lapacke_zhetrs_work.o \ ++lapacke_zhfrk.o \ ++lapacke_zhfrk_work.o \ ++lapacke_zhgeqz.o \ ++lapacke_zhgeqz_work.o \ ++lapacke_zhpcon.o \ ++lapacke_zhpcon_work.o \ ++lapacke_zhpev.o \ ++lapacke_zhpevd.o \ ++lapacke_zhpevd_work.o \ ++lapacke_zhpev_work.o \ ++lapacke_zhpevx.o \ ++lapacke_zhpevx_work.o \ ++lapacke_zhpgst.o \ ++lapacke_zhpgst_work.o \ ++lapacke_zhpgv.o \ ++lapacke_zhpgvd.o \ ++lapacke_zhpgvd_work.o \ ++lapacke_zhpgv_work.o \ ++lapacke_zhpgvx.o \ ++lapacke_zhpgvx_work.o \ ++lapacke_zhprfs.o \ ++lapacke_zhprfs_work.o \ ++lapacke_zhpsv.o \ ++lapacke_zhpsv_work.o \ ++lapacke_zhpsvx.o \ ++lapacke_zhpsvx_work.o \ ++lapacke_zhptrd.o \ ++lapacke_zhptrd_work.o \ ++lapacke_zhptrf.o \ ++lapacke_zhptrf_work.o \ ++lapacke_zhptri.o \ ++lapacke_zhptri_work.o \ ++lapacke_zhptrs.o \ ++lapacke_zhptrs_work.o \ ++lapacke_zhsein.o \ ++lapacke_zhsein_work.o \ ++lapacke_zhseqr.o \ ++lapacke_zhseqr_work.o \ ++lapacke_zlacgv.o \ ++lapacke_zlacgv_work.o \ ++lapacke_zlacpy.o \ ++lapacke_zlacpy_work.o \ ++lapacke_zlag2c.o \ ++lapacke_zlag2c_work.o \ ++lapacke_zlange.o \ ++lapacke_zlange_work.o \ ++lapacke_zlanhe.o \ ++lapacke_zlanhe_work.o \ ++lapacke_zlansy.o \ ++lapacke_zlansy_work.o \ ++lapacke_zlantr.o \ ++lapacke_zlantr_work.o \ ++lapacke_zlapmr.o \ ++lapacke_zlapmr_work.o \ ++lapacke_zlarfb.o \ ++lapacke_zlarfb_work.o \ ++lapacke_zlarfg.o \ ++lapacke_zlarfg_work.o \ ++lapacke_zlarft.o \ ++lapacke_zlarft_work.o \ ++lapacke_zlarfx.o \ ++lapacke_zlarfx_work.o \ ++lapacke_zlarnv.o \ ++lapacke_zlarnv_work.o \ ++lapacke_zlaset.o \ ++lapacke_zlaset_work.o \ ++lapacke_zlaswp.o \ ++lapacke_zlaswp_work.o \ ++lapacke_zlauum.o \ ++lapacke_zlauum_work.o \ ++lapacke_zpbcon.o \ ++lapacke_zpbcon_work.o \ ++lapacke_zpbequ.o \ ++lapacke_zpbequ_work.o \ ++lapacke_zpbrfs.o \ ++lapacke_zpbrfs_work.o \ ++lapacke_zpbstf.o \ ++lapacke_zpbstf_work.o \ ++lapacke_zpbsv.o \ ++lapacke_zpbsv_work.o \ ++lapacke_zpbsvx.o \ ++lapacke_zpbsvx_work.o \ ++lapacke_zpbtrf.o \ ++lapacke_zpbtrf_work.o \ ++lapacke_zpbtrs.o \ ++lapacke_zpbtrs_work.o \ 
++lapacke_zpftrf.o \ ++lapacke_zpftrf_work.o \ ++lapacke_zpftri.o \ ++lapacke_zpftri_work.o \ ++lapacke_zpftrs.o \ ++lapacke_zpftrs_work.o \ ++lapacke_zpocon.o \ ++lapacke_zpocon_work.o \ ++lapacke_zpoequb.o \ ++lapacke_zpoequb_work.o \ ++lapacke_zpoequ.o \ ++lapacke_zpoequ_work.o \ ++lapacke_zporfs.o \ ++lapacke_zporfs_work.o \ ++lapacke_zposv.o \ ++lapacke_zposv_work.o \ ++lapacke_zposvx.o \ ++lapacke_zposvx_work.o \ ++lapacke_zpotrf.o \ ++lapacke_zpotrf_work.o \ ++lapacke_zpotri.o \ ++lapacke_zpotri_work.o \ ++lapacke_zpotrs.o \ ++lapacke_zpotrs_work.o \ ++lapacke_zppcon.o \ ++lapacke_zppcon_work.o \ ++lapacke_zppequ.o \ ++lapacke_zppequ_work.o \ ++lapacke_zpprfs.o \ ++lapacke_zpprfs_work.o \ ++lapacke_zppsv.o \ ++lapacke_zppsv_work.o \ ++lapacke_zppsvx.o \ ++lapacke_zppsvx_work.o \ ++lapacke_zpptrf.o \ ++lapacke_zpptrf_work.o \ ++lapacke_zpptri.o \ ++lapacke_zpptri_work.o \ ++lapacke_zpptrs.o \ ++lapacke_zpptrs_work.o \ ++lapacke_zpstrf.o \ ++lapacke_zpstrf_work.o \ ++lapacke_zptcon.o \ ++lapacke_zptcon_work.o \ ++lapacke_zpteqr.o \ ++lapacke_zpteqr_work.o \ ++lapacke_zptrfs.o \ ++lapacke_zptrfs_work.o \ ++lapacke_zptsv.o \ ++lapacke_zptsv_work.o \ ++lapacke_zptsvx.o \ ++lapacke_zptsvx_work.o \ ++lapacke_zpttrf.o \ ++lapacke_zpttrf_work.o \ ++lapacke_zpttrs.o \ ++lapacke_zpttrs_work.o \ ++lapacke_zspcon.o \ ++lapacke_zspcon_work.o \ ++lapacke_zsprfs.o \ ++lapacke_zsprfs_work.o \ ++lapacke_zspsv.o \ ++lapacke_zspsv_work.o \ ++lapacke_zspsvx.o \ ++lapacke_zspsvx_work.o \ ++lapacke_zsptrf.o \ ++lapacke_zsptrf_work.o \ ++lapacke_zsptri.o \ ++lapacke_zsptri_work.o \ ++lapacke_zsptrs.o \ ++lapacke_zsptrs_work.o \ ++lapacke_zstedc.o \ ++lapacke_zstedc_work.o \ ++lapacke_zstegr.o \ ++lapacke_zstegr_work.o \ ++lapacke_zstein.o \ ++lapacke_zstein_work.o \ ++lapacke_zstemr.o \ ++lapacke_zstemr_work.o \ ++lapacke_zsteqr.o \ ++lapacke_zsteqr_work.o \ ++lapacke_zsycon.o \ ++lapacke_zsyconv.o \ ++lapacke_zsyconv_work.o \ ++lapacke_zsycon_work.o \ ++lapacke_zsyequb.o \ ++lapacke_zsyequb_work.o \ ++lapacke_zsyrfs.o \ ++lapacke_zsyrfs_work.o \ ++lapacke_zsysv.o \ ++lapacke_zsysv_work.o \ ++lapacke_zsysvx.o \ ++lapacke_zsysvx_work.o \ ++lapacke_zsyswapr.o \ ++lapacke_zsyswapr_work.o \ ++lapacke_zsytrf.o \ ++lapacke_zsytrf_work.o \ ++lapacke_zsytri2.o \ ++lapacke_zsytri2_work.o \ ++lapacke_zsytri2x.o \ ++lapacke_zsytri2x_work.o \ ++lapacke_zsytri.o \ ++lapacke_zsytri_work.o \ ++lapacke_zsytrs2.o \ ++lapacke_zsytrs2_work.o \ ++lapacke_zsytrs.o \ ++lapacke_zsytrs_work.o \ ++lapacke_ztbcon.o \ ++lapacke_ztbcon_work.o \ ++lapacke_ztbrfs.o \ ++lapacke_ztbrfs_work.o \ ++lapacke_ztbtrs.o \ ++lapacke_ztbtrs_work.o \ ++lapacke_ztfsm.o \ ++lapacke_ztfsm_work.o \ ++lapacke_ztftri.o \ ++lapacke_ztftri_work.o \ ++lapacke_ztfttp.o \ ++lapacke_ztfttp_work.o \ ++lapacke_ztfttr.o \ ++lapacke_ztfttr_work.o \ ++lapacke_ztgevc.o \ ++lapacke_ztgevc_work.o \ ++lapacke_ztgexc.o \ ++lapacke_ztgexc_work.o \ ++lapacke_ztgsen.o \ ++lapacke_ztgsen_work.o \ ++lapacke_ztgsja.o \ ++lapacke_ztgsja_work.o \ ++lapacke_ztgsna.o \ ++lapacke_ztgsna_work.o \ ++lapacke_ztgsyl.o \ ++lapacke_ztgsyl_work.o \ ++lapacke_ztpcon.o \ ++lapacke_ztpcon_work.o \ ++lapacke_ztpmqrt.o \ ++lapacke_ztpmqrt_work.o \ ++lapacke_ztpqrt2.o \ ++lapacke_ztpqrt2_work.o \ ++lapacke_ztpqrt.o \ ++lapacke_ztpqrt_work.o \ ++lapacke_ztprfb.o \ ++lapacke_ztprfb_work.o \ ++lapacke_ztprfs.o \ ++lapacke_ztprfs_work.o \ ++lapacke_ztptri.o \ ++lapacke_ztptri_work.o \ ++lapacke_ztptrs.o \ ++lapacke_ztptrs_work.o \ ++lapacke_ztpttf.o \ ++lapacke_ztpttf_work.o \ ++lapacke_ztpttr.o 
\ ++lapacke_ztpttr_work.o \ ++lapacke_ztrcon.o \ ++lapacke_ztrcon_work.o \ ++lapacke_ztrevc.o \ ++lapacke_ztrevc_work.o \ ++lapacke_ztrexc.o \ ++lapacke_ztrexc_work.o \ ++lapacke_ztrrfs.o \ ++lapacke_ztrrfs_work.o \ ++lapacke_ztrsen.o \ ++lapacke_ztrsen_work.o \ ++lapacke_ztrsna.o \ ++lapacke_ztrsna_work.o \ ++lapacke_ztrsyl.o \ ++lapacke_ztrsyl_work.o \ ++lapacke_ztrtri.o \ ++lapacke_ztrtri_work.o \ ++lapacke_ztrtrs.o \ ++lapacke_ztrtrs_work.o \ ++lapacke_ztrttf.o \ ++lapacke_ztrttf_work.o \ ++lapacke_ztrttp.o \ ++lapacke_ztrttp_work.o \ ++lapacke_ztzrzf.o \ ++lapacke_ztzrzf_work.o \ ++lapacke_zunbdb.o \ ++lapacke_zunbdb_work.o \ ++lapacke_zuncsd.o \ ++lapacke_zuncsd_work.o \ ++lapacke_zungbr.o \ ++lapacke_zungbr_work.o \ ++lapacke_zunghr.o \ ++lapacke_zunghr_work.o \ ++lapacke_zunglq.o \ ++lapacke_zunglq_work.o \ ++lapacke_zungql.o \ ++lapacke_zungql_work.o \ ++lapacke_zungqr.o \ ++lapacke_zungqr_work.o \ ++lapacke_zungrq.o \ ++lapacke_zungrq_work.o \ ++lapacke_zungtr.o \ ++lapacke_zungtr_work.o \ ++lapacke_zunmbr.o \ ++lapacke_zunmbr_work.o \ ++lapacke_zunmhr.o \ ++lapacke_zunmhr_work.o \ ++lapacke_zunmlq.o \ ++lapacke_zunmlq_work.o \ ++lapacke_zunmql.o \ ++lapacke_zunmql_work.o \ ++lapacke_zunmqr.o \ ++lapacke_zunmqr_work.o \ ++lapacke_zunmrq.o \ ++lapacke_zunmrq_work.o \ ++lapacke_zunmrz.o \ ++lapacke_zunmrz_work.o \ ++lapacke_zunmtr.o \ ++lapacke_zunmtr_work.o \ ++lapacke_zupgtr.o \ ++lapacke_zupgtr_work.o \ ++lapacke_zupmtr.o \ ++lapacke_zupmtr_work.o ++ ++ ++LAPACKE_EXTENDEDPRECISION_OBJS := \ ++ lapacke_dgbrfsx.o lapacke_dgbrfsx_work.o lapacke_zgbrfsx.o lapacke_zgbrfsx_work.o lapacke_zsyrfsx.o \ ++lapacke_zsyrfsx_work.o \ ++lapacke_zgerfsx.o \ ++lapacke_zgerfsx_work.o \ ++lapacke_zporfsx.o \ ++lapacke_zporfsx_work.o \ ++lapacke_sgerfsx.o \ ++lapacke_sgerfsx_work.o \ ++lapacke_zgesvxx.o \ ++lapacke_zgesvxx_work.o \ ++lapacke_sgbrfsx.o \ ++lapacke_sgbrfsx_work.o \ ++lapacke_ssysvxx.o \ ++lapacke_ssysvxx_work.o \ ++lapacke_sgesvxx.o \ ++lapacke_sgesvxx_work.o \ ++lapacke_cgbsvxx.o \ ++lapacke_cgbsvxx_work.o \ ++lapacke_cporfsx.o \ ++lapacke_cporfsx_work.o \ ++lapacke_cherfsx.o \ ++lapacke_cherfsx_work.o \ ++lapacke_dporfsx.o \ ++lapacke_dporfsx_work.o \ ++lapacke_sposvxx.o \ ++lapacke_sposvxx_work.o \ ++lapacke_sgbsvxx.o \ ++lapacke_sgbsvxx_work.o \ ++lapacke_zposvxx.o \ ++lapacke_zposvxx_work.o \ ++lapacke_chesvxx.o \ ++lapacke_chesvxx_work.o \ ++lapacke_cposvxx.o \ ++lapacke_cposvxx_work.o \ ++lapacke_cgesvxx.o \ ++lapacke_cgesvxx_work.o \ ++lapacke_ssyrfsx.o \ ++lapacke_ssyrfsx_work.o \ ++lapacke_csyrfsx.o \ ++lapacke_csyrfsx_work.o \ ++lapacke_dsysvxx.o \ ++lapacke_dsysvxx_work.o \ ++lapacke_sporfsx.o \ ++lapacke_sporfsx_work.o \ ++lapacke_zherfsx.o \ ++lapacke_zherfsx_work.o \ ++lapacke_csysvxx.o \ ++lapacke_csysvxx_work.o \ ++lapacke_dposvxx.o \ ++lapacke_dposvxx_work.o \ ++lapacke_cgerfsx.o \ ++lapacke_cgerfsx_work.o \ ++lapacke_zgbsvxx.o \ ++lapacke_zgbsvxx_work.o \ ++lapacke_zsysvxx.o \ ++lapacke_zsysvxx_work.o \ ++lapacke_dgesvxx.o \ ++lapacke_dgesvxx_work.o \ ++lapacke_dgerfsx.o \ ++lapacke_dgerfsx_work.o \ ++lapacke_dsyrfsx.o \ ++lapacke_dsyrfsx_work.o \ ++lapacke_claghe.o \ ++lapacke_claghe_work.o \ ++lapacke_zhesvxx.o \ ++lapacke_zhesvxx_work.o \ ++lapacke_cgbrfsx.o \ ++lapacke_cgbrfsx_work.o \ ++lapacke_dgbsvxx.o \ ++lapacke_dgbsvxx_work.o ++ ++LAPACKE_TESTING_OBJS := \ ++ lapacke_slagge.o lapacke_slagge_work.o lapacke_clagge.o lapacke_clagge_work.o \ ++ lapacke_clatms.o lapacke_clatms_work.o lapacke_slatms.o lapacke_slatms_work.o lapacke_zlatms.o lapacke_zlatms_work.o
\ ++ lapacke_clagsy.o lapacke_clagsy_work.o lapacke_slagsy.o lapacke_slagsy_work.o \ ++ lapacke_zlagsy.o lapacke_zlagsy_work.o lapacke_zlagge.o lapacke_zlagge_work.o \ ++ lapacke_dlatms.o lapacke_dlatms_work.o lapacke_zlaghe.o lapacke_zlaghe_work.o \ ++ lapacke_dlagsy.o lapacke_dlagsy_work.o lapacke_dlagge.o lapacke_dlagge_work.o ++ ++ ++OBJ_FILES := $(LAPACKE_OBJS) ++ ++ifdef LAPACKE_EXTENDED ++OBJ_FILES += $(LAPACKE_EXTENDEDPRECISION_OBJS) ++endif ++ ++ifdef LAPACKE_TESTING ++OBJ_FILES += $(LAPACKE_TESTING_OBJS) ++endif + + all: lib + + lib: $(OBJ_FILES) +- $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) $(OBJ_FILES) ++# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 ++ echo $(OBJ_FILES) | xargs --max-args=100 $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) + $(RANLIB) ../$(LAPACKE) + + .c.o: diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 new file mode 100644 index 000000000..ff4954b09 --- /dev/null +++ b/patch.for_lapack-3.4.1 @@ -0,0 +1,932 @@ +diff -ruN lapack-3.4.1.old/INSTALL/Makefile lapack-3.4.1/INSTALL/Makefile +--- lapack-3.4.1.old/INSTALL/Makefile 2011-10-01 04:37:03 +0200 ++++ lapack-3.4.1/INSTALL/Makefile 2012-04-22 21:48:48 +0200 +@@ -27,7 +27,7 @@ + $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o + + clean: +- rm -f *.o ++ rm -f *.o test* + .f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ + +diff -ruN lapack-3.4.1.old/Makefile lapack-3.4.1/Makefile +--- lapack-3.4.1.old/Makefile 2012-04-13 20:13:07 +0200 ++++ lapack-3.4.1/Makefile 2012-04-22 21:48:07 +0200 +@@ -20,9 +20,12 @@ + blaslib: + ( cd BLAS/SRC; $(MAKE) ) + +-lapacklib: lapack_install ++lapacklib: + ( cd SRC; $(MAKE) ) + ++lapack_prof: ++ ( cd SRC; $(MAKE) lapack_prof) ++ + lapackelib: lapacklib + ( cd lapacke; $(MAKE) ) + +diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile +--- lapack-3.4.1.old/SRC/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.1/SRC/Makefile 2012-04-22 21:40:21 +0200 +@@ -54,363 +54,371 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ +- ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) xerbla_array.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ilaprec.$(SUFFIX) ilatrans.$(SUFFIX) ilauplo.$(SUFFIX) iladiag.$(SUFFIX) chla_transtype.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- slartgp.o slartgs.o \ +- ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX)
slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slasq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ slartgp.$(SUFFIX) slartgs.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlasq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- dlartgp.o dlartgs.o \ +- ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlasq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ dlartgp.$(SUFFIX) dlartgs.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqr2p.o 
sgeqrf.o sgeqrfp.o sgerfs.o \ +- sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o \ +- sgetc2.o sgetf2.o sgetri.o \ +- sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ +- slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotri.o spstrf.o spstf2.o \ +- sppcon.o sppequ.o \ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o \ +- ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ +- ssyswapr.o ssytrs.o ssytrs2.o ssyconv.o \ +- stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ +- slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ +- stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ +- sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ +- sgeequb.o ssyequb.o spoequb.o sgbequb.o \ +- sbbcsd.o slapmr.o sorbdb.o sorcsd.o \ +- sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ +- stpqrt.o stpqrt2.o stpmqrt.o stprfb.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqr2p.$(SUFFIX) sgeqrf.$(SUFFIX) sgeqrfp.$(SUFFIX) sgerfs.$(SUFFIX) \ ++ sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) 
sgesv.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) \ ++ sgetc2.$(SUFFIX) sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotri.$(SUFFIX) spstrf.$(SUFFIX) spstf2.$(SUFFIX) \ ++ sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) \ ++ ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) 
ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytri2.$(SUFFIX) ssytri2x.$(SUFFIX) \ ++ ssyswapr.$(SUFFIX) ssytrs.$(SUFFIX) ssytrs2.$(SUFFIX) ssyconv.$(SUFFIX) \ ++ stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) \ ++ slansf.$(SUFFIX) spftrf.$(SUFFIX) spftri.$(SUFFIX) spftrs.$(SUFFIX) ssfrk.$(SUFFIX) stfsm.$(SUFFIX) stftri.$(SUFFIX) stfttp.$(SUFFIX) \ ++ stfttr.$(SUFFIX) stpttf.$(SUFFIX) stpttr.$(SUFFIX) strttf.$(SUFFIX) strttp.$(SUFFIX) \ ++ sgejsv.$(SUFFIX) sgesvj.$(SUFFIX) sgsvj0.$(SUFFIX) sgsvj1.$(SUFFIX) \ ++ sgeequb.$(SUFFIX) ssyequb.$(SUFFIX) spoequb.$(SUFFIX) sgbequb.$(SUFFIX) \ ++ sbbcsd.$(SUFFIX) slapmr.$(SUFFIX) sorbdb.$(SUFFIX) sorcsd.$(SUFFIX) \ ++ sgeqrt.$(SUFFIX) sgeqrt2.$(SUFFIX) sgeqrt3.$(SUFFIX) sgemqrt.$(SUFFIX) \ ++ stpqrt.$(SUFFIX) stpqrt2.$(SUFFIX) stpmqrt.$(SUFFIX) stprfb.$(SUFFIX) + +-DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o ++DSLASRC = spotrs.$(SUFFIX) + + ifdef USEXBLAS +-SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ +- sla_gercond.o sla_gerpvgrw.o ssysvxx.o ssyrfsx.o \ +- sla_syrfsx_extended.o sla_syamv.o sla_syrcond.o sla_syrpvgrw.o \ +- sposvxx.o sporfsx.o sla_porfsx_extended.o sla_porcond.o \ +- sla_porpvgrw.o sgbsvxx.o sgbrfsx.o sla_gbrfsx_extended.o \ +- sla_gbamv.o sla_gbrcond.o sla_gbrpvgrw.o sla_lin_berr.o slarscl2.o \ +- slascl2.o sla_wwaddw.o ++SXLASRC = sgesvxx.$(SUFFIX) sgerfsx.$(SUFFIX) sla_gerfsx_extended.$(SUFFIX) sla_geamv.$(SUFFIX) \ ++ sla_gercond.$(SUFFIX) sla_gerpvgrw.$(SUFFIX) ssysvxx.$(SUFFIX) ssyrfsx.$(SUFFIX) \ ++ sla_syrfsx_extended.$(SUFFIX) sla_syamv.$(SUFFIX) sla_syrcond.$(SUFFIX) sla_syrpvgrw.$(SUFFIX) \ ++ sposvxx.$(SUFFIX) sporfsx.$(SUFFIX) sla_porfsx_extended.$(SUFFIX) sla_porcond.$(SUFFIX) \ ++ sla_porpvgrw.$(SUFFIX) sgbsvxx.$(SUFFIX) sgbrfsx.$(SUFFIX) sla_gbrfsx_extended.$(SUFFIX) \ ++ sla_gbamv.$(SUFFIX) sla_gbrcond.$(SUFFIX) sla_gbrpvgrw.$(SUFFIX) sla_lin_berr.$(SUFFIX) slarscl2.$(SUFFIX) \ ++ slascl2.$(SUFFIX) sla_wwaddw.$(SUFFIX) + endif + + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ +- cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o \ +- cgesvx.o cgetc2.o cgetf2.o cgetri.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetri2.o chetri2x.o cheswapr.o \ +- chetrs.o chetrs2.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- chptrd.o chptrf.o chptri.o 
chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ +- clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ +- cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o \ +- csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o csytri2.o csytri2x.o \ +- csyswapr.o csytrs.o csytrs2.o csyconv.o \ +- ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ +- chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ +- ctfttr.o ctpttf.o ctpttr.o ctrttf.o ctrttp.o \ +- cgeequb.o cgbequb.o csyequb.o cpoequb.o cheequb.o \ +- cbbcsd.o clapmr.o cunbdb.o cuncsd.o \ +- cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ +- ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqr2p.$(SUFFIX) cgeqrf.$(SUFFIX) cgeqrfp.$(SUFFIX) cgerfs.$(SUFFIX) \ ++ cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesv.$(SUFFIX) cgesvd.$(SUFFIX) \ ++ cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) 
cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetri2.$(SUFFIX) chetri2x.$(SUFFIX) cheswapr.$(SUFFIX) \ ++ chetrs.$(SUFFIX) chetrs2.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) ilaclr.$(SUFFIX) ilaclc.$(SUFFIX) \ ++ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ ++ cppcon.$(SUFFIX) cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) cspmv.$(SUFFIX) cspr.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) \ ++ csycon.$(SUFFIX) csymv.$(SUFFIX) \ ++ csyr.$(SUFFIX) csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) csytri2.$(SUFFIX) csytri2x.$(SUFFIX) 
\ ++ csyswapr.$(SUFFIX) csytrs.$(SUFFIX) csytrs2.$(SUFFIX) csyconv.$(SUFFIX) \ ++ ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) \ ++ chfrk.$(SUFFIX) ctfttp.$(SUFFIX) clanhf.$(SUFFIX) cpftrf.$(SUFFIX) cpftri.$(SUFFIX) cpftrs.$(SUFFIX) ctfsm.$(SUFFIX) ctftri.$(SUFFIX) \ ++ ctfttr.$(SUFFIX) ctpttf.$(SUFFIX) ctpttr.$(SUFFIX) ctrttf.$(SUFFIX) ctrttp.$(SUFFIX) \ ++ cgeequb.$(SUFFIX) cgbequb.$(SUFFIX) csyequb.$(SUFFIX) cpoequb.$(SUFFIX) cheequb.$(SUFFIX) \ ++ cbbcsd.$(SUFFIX) clapmr.$(SUFFIX) cunbdb.$(SUFFIX) cuncsd.$(SUFFIX) \ ++ cgeqrt.$(SUFFIX) cgeqrt2.$(SUFFIX) cgeqrt3.$(SUFFIX) cgemqrt.$(SUFFIX) \ ++ ctpqrt.$(SUFFIX) ctpqrt2.$(SUFFIX) ctpmqrt.$(SUFFIX) ctprfb.$(SUFFIX) + + ifdef USEXBLAS +-CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ +- cla_gercond_c.o cla_gercond_x.o cla_gerpvgrw.o \ +- csysvxx.o csyrfsx.o cla_syrfsx_extended.o cla_syamv.o \ +- cla_syrcond_c.o cla_syrcond_x.o cla_syrpvgrw.o \ +- cposvxx.o cporfsx.o cla_porfsx_extended.o \ +- cla_porcond_c.o cla_porcond_x.o cla_porpvgrw.o \ +- cgbsvxx.o cgbrfsx.o cla_gbrfsx_extended.o cla_gbamv.o \ +- cla_gbrcond_c.o cla_gbrcond_x.o cla_gbrpvgrw.o \ +- chesvxx.o cherfsx.o cla_herfsx_extended.o cla_heamv.o \ +- cla_hercond_c.o cla_hercond_x.o cla_herpvgrw.o \ +- cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o ++CXLASRC = cgesvxx.$(SUFFIX) cgerfsx.$(SUFFIX) cla_gerfsx_extended.$(SUFFIX) cla_geamv.$(SUFFIX) \ ++ cla_gercond_c.$(SUFFIX) cla_gercond_x.$(SUFFIX) cla_gerpvgrw.$(SUFFIX) \ ++ csysvxx.$(SUFFIX) csyrfsx.$(SUFFIX) cla_syrfsx_extended.$(SUFFIX) cla_syamv.$(SUFFIX) \ ++ cla_syrcond_c.$(SUFFIX) cla_syrcond_x.$(SUFFIX) cla_syrpvgrw.$(SUFFIX) \ ++ cposvxx.$(SUFFIX) cporfsx.$(SUFFIX) cla_porfsx_extended.$(SUFFIX) \ ++ cla_porcond_c.$(SUFFIX) cla_porcond_x.$(SUFFIX) cla_porpvgrw.$(SUFFIX) \ ++ cgbsvxx.$(SUFFIX) cgbrfsx.$(SUFFIX) cla_gbrfsx_extended.$(SUFFIX) cla_gbamv.$(SUFFIX) \ ++ cla_gbrcond_c.$(SUFFIX) cla_gbrcond_x.$(SUFFIX) cla_gbrpvgrw.$(SUFFIX) \ ++ chesvxx.$(SUFFIX) cherfsx.$(SUFFIX) cla_herfsx_extended.$(SUFFIX) cla_heamv.$(SUFFIX) \ ++ cla_hercond_c.$(SUFFIX) cla_hercond_x.$(SUFFIX) cla_herpvgrw.$(SUFFIX) \ ++ cla_lin_berr.$(SUFFIX) clarscl2.$(SUFFIX) clascl2.$(SUFFIX) cla_wwaddw.$(SUFFIX) + endif + +-ZCLASRC = cpotrs.o cgetrs.o cpotrf.o cgetrf.o ++ZCLASRC = cpotrs.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o 
dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ +- dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o \ +- dgetc2.o dgetf2.o dgetrf.o dgetri.o \ +- dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ +- dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ +- dlargv.o dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dpstrf.o dpstf2.o \ +- dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o \ +- dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o dsytri.o dsytri2.o dsytri2x.o \ +- dsyswapr.o dsytrs.o dsytrs2.o dsyconv.o \ +- dtbcon.o dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ +- dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ +- dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ +- dgejsv.o dgesvj.o dgsvj0.o dgsvj1.o \ +- dgeequb.o dsyequb.o dpoequb.o dgbequb.o \ +- dbbcsd.o dlapmr.o dorbdb.o dorcsd.o \ +- dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ +- dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqr2p.$(SUFFIX) dgeqrf.$(SUFFIX) 
dgeqrfp.$(SUFFIX) dgerfs.$(SUFFIX) \ ++ dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesv.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) \ ++ dgetc2.$(SUFFIX) dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ ++ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotri.$(SUFFIX) dpotrs.$(SUFFIX) dpstrf.$(SUFFIX) dpstf2.$(SUFFIX) \ ++ dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) \ ++ dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) 
dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytri2.$(SUFFIX) dsytri2x.$(SUFFIX) \ ++ dsyswapr.$(SUFFIX) dsytrs.$(SUFFIX) dsytrs2.$(SUFFIX) dsyconv.$(SUFFIX) \ ++ dtbcon.$(SUFFIX) dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dsposv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) dlat2s.$(SUFFIX) \ ++ dlansf.$(SUFFIX) dpftrf.$(SUFFIX) dpftri.$(SUFFIX) dpftrs.$(SUFFIX) dsfrk.$(SUFFIX) dtfsm.$(SUFFIX) dtftri.$(SUFFIX) dtfttp.$(SUFFIX) \ ++ dtfttr.$(SUFFIX) dtpttf.$(SUFFIX) dtpttr.$(SUFFIX) dtrttf.$(SUFFIX) dtrttp.$(SUFFIX) \ ++ dgejsv.$(SUFFIX) dgesvj.$(SUFFIX) dgsvj0.$(SUFFIX) dgsvj1.$(SUFFIX) \ ++ dgeequb.$(SUFFIX) dsyequb.$(SUFFIX) dpoequb.$(SUFFIX) dgbequb.$(SUFFIX) \ ++ dbbcsd.$(SUFFIX) dlapmr.$(SUFFIX) dorbdb.$(SUFFIX) dorcsd.$(SUFFIX) \ ++ dgeqrt.$(SUFFIX) dgeqrt2.$(SUFFIX) dgeqrt3.$(SUFFIX) dgemqrt.$(SUFFIX) \ ++ dtpqrt.$(SUFFIX) dtpqrt2.$(SUFFIX) dtpmqrt.$(SUFFIX) dtprfb.$(SUFFIX) + + ifdef USEXBLAS +-DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ +- dla_gercond.o dla_gerpvgrw.o dsysvxx.o dsyrfsx.o \ +- dla_syrfsx_extended.o dla_syamv.o dla_syrcond.o dla_syrpvgrw.o \ +- dposvxx.o dporfsx.o dla_porfsx_extended.o dla_porcond.o \ +- dla_porpvgrw.o dgbsvxx.o dgbrfsx.o dla_gbrfsx_extended.o \ +- dla_gbamv.o dla_gbrcond.o dla_gbrpvgrw.o dla_lin_berr.o dlarscl2.o \ +- dlascl2.o dla_wwaddw.o ++DXLASRC = dgesvxx.$(SUFFIX) dgerfsx.$(SUFFIX) dla_gerfsx_extended.$(SUFFIX) dla_geamv.$(SUFFIX) \ ++ dla_gercond.$(SUFFIX) dla_gerpvgrw.$(SUFFIX) dsysvxx.$(SUFFIX) dsyrfsx.$(SUFFIX) \ ++ dla_syrfsx_extended.$(SUFFIX) dla_syamv.$(SUFFIX) dla_syrcond.$(SUFFIX) dla_syrpvgrw.$(SUFFIX) \ ++ dposvxx.$(SUFFIX) dporfsx.$(SUFFIX) dla_porfsx_extended.$(SUFFIX) dla_porcond.$(SUFFIX) \ ++ dla_porpvgrw.$(SUFFIX) dgbsvxx.$(SUFFIX) dgbrfsx.$(SUFFIX) dla_gbrfsx_extended.$(SUFFIX) \ ++ dla_gbamv.$(SUFFIX) dla_gbrcond.$(SUFFIX) dla_gbrpvgrw.$(SUFFIX) dla_lin_berr.$(SUFFIX) dlarscl2.$(SUFFIX) \ ++ dlascl2.$(SUFFIX) dla_wwaddw.$(SUFFIX) + endif + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o \ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetri2.o zhetri2x.o zheswapr.o \ +- zhetrs.o 
zhetrs2.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ +- zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o zlarfgp.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ +- zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o \ +- zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o zsytri2.o zsytri2x.o \ +- zsyswapr.o zsytrs.o zsytrs2.o zsyconv.o \ +- ztbcon.o ztbrfs.o ztbtrs.o ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ +- zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ +- ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ +- zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ +- zbbcsd.o zlapmr.o zunbdb.o zuncsd.o \ +- zgeqrt.o zgeqrt2.o zgeqrt3.o zgemqrt.o \ +- ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqr2p.$(SUFFIX) zgeqrf.$(SUFFIX) zgeqrfp.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesv.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) 
zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetri2.$(SUFFIX) zhetri2x.$(SUFFIX) zheswapr.$(SUFFIX) \ ++ zhetrs.$(SUFFIX) zhetrs2.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) ilazlr.$(SUFFIX) ilazlc.$(SUFFIX) \ ++ zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotri.$(SUFFIX) zpotrs.$(SUFFIX) zpstrf.$(SUFFIX) zpstf2.$(SUFFIX) \ ++ zppcon.$(SUFFIX) zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zspmv.$(SUFFIX) zspr.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) 
zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) \ ++ zsycon.$(SUFFIX) zsymv.$(SUFFIX) \ ++ zsyr.$(SUFFIX) zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) zsytri2.$(SUFFIX) zsytri2x.$(SUFFIX) \ ++ zsyswapr.$(SUFFIX) zsytrs.$(SUFFIX) zsytrs2.$(SUFFIX) zsyconv.$(SUFFIX) \ ++ ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zcposv.$(SUFFIX) zlag2c.$(SUFFIX) clag2z.$(SUFFIX) zlat2c.$(SUFFIX) \ ++ zhfrk.$(SUFFIX) ztfttp.$(SUFFIX) zlanhf.$(SUFFIX) zpftrf.$(SUFFIX) zpftri.$(SUFFIX) zpftrs.$(SUFFIX) ztfsm.$(SUFFIX) ztftri.$(SUFFIX) \ ++ ztfttr.$(SUFFIX) ztpttf.$(SUFFIX) ztpttr.$(SUFFIX) ztrttf.$(SUFFIX) ztrttp.$(SUFFIX) \ ++ zgeequb.$(SUFFIX) zgbequb.$(SUFFIX) zsyequb.$(SUFFIX) zpoequb.$(SUFFIX) zheequb.$(SUFFIX) \ ++ zbbcsd.$(SUFFIX) zlapmr.$(SUFFIX) zunbdb.$(SUFFIX) zuncsd.$(SUFFIX) \ ++ zgeqrt.$(SUFFIX) zgeqrt2.$(SUFFIX) zgeqrt3.$(SUFFIX) zgemqrt.$(SUFFIX) \ ++ ztpqrt.$(SUFFIX) ztpqrt2.$(SUFFIX) ztpmqrt.$(SUFFIX) ztprfb.$(SUFFIX) + + ifdef USEXBLAS +-ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ +- zla_gercond_c.o zla_gercond_x.o zla_gerpvgrw.o zsysvxx.o zsyrfsx.o \ +- zla_syrfsx_extended.o zla_syamv.o zla_syrcond_c.o zla_syrcond_x.o \ +- zla_syrpvgrw.o zposvxx.o zporfsx.o zla_porfsx_extended.o \ +- zla_porcond_c.o zla_porcond_x.o zla_porpvgrw.o zgbsvxx.o zgbrfsx.o \ +- zla_gbrfsx_extended.o zla_gbamv.o zla_gbrcond_c.o zla_gbrcond_x.o \ +- zla_gbrpvgrw.o zhesvxx.o zherfsx.o zla_herfsx_extended.o \ +- zla_heamv.o zla_hercond_c.o zla_hercond_x.o zla_herpvgrw.o \ +- zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o ++ZXLASRC = zgesvxx.$(SUFFIX) zgerfsx.$(SUFFIX) zla_gerfsx_extended.$(SUFFIX) zla_geamv.$(SUFFIX) \ ++ zla_gercond_c.$(SUFFIX) zla_gercond_x.$(SUFFIX) zla_gerpvgrw.$(SUFFIX) zsysvxx.$(SUFFIX) zsyrfsx.$(SUFFIX) \ ++ zla_syrfsx_extended.$(SUFFIX) zla_syamv.$(SUFFIX) zla_syrcond_c.$(SUFFIX) zla_syrcond_x.$(SUFFIX) \ ++ zla_syrpvgrw.$(SUFFIX) zposvxx.$(SUFFIX) zporfsx.$(SUFFIX) zla_porfsx_extended.$(SUFFIX) \ ++ zla_porcond_c.$(SUFFIX) zla_porcond_x.$(SUFFIX) zla_porpvgrw.$(SUFFIX) zgbsvxx.$(SUFFIX) zgbrfsx.$(SUFFIX) \ ++ zla_gbrfsx_extended.$(SUFFIX) zla_gbamv.$(SUFFIX) zla_gbrcond_c.$(SUFFIX) zla_gbrcond_x.$(SUFFIX) \ ++ zla_gbrpvgrw.$(SUFFIX) zhesvxx.$(SUFFIX) zherfsx.$(SUFFIX) zla_herfsx_extended.$(SUFFIX) \ ++ zla_heamv.$(SUFFIX) zla_hercond_c.$(SUFFIX) zla_hercond_x.$(SUFFIX) zla_herpvgrw.$(SUFFIX) \ ++ zla_lin_berr.$(SUFFIX) zlarscl2.$(SUFFIX) zlascl2.$(SUFFIX) zla_wwaddw.$(SUFFIX) + endif + + ALLOBJ = $(SLASRC) $(DLASRC) $(DSLASRC) 
$(CLASRC) $(ZLASRC) $(ZCLASRC) \
+ $(SCLAUX) $(DZLAUX) $(ALLAUX)
+
++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX))
++
+ ifdef USEXBLAS
+ ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC)
+ endif
+
+ all: ../$(LAPACKLIB)
+
++lapack_prof: ../$(LAPACKLIB_P)
++
+ ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ)
+ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ)
+ $(RANLIB) $@
+
++../$(LAPACKLIB_P): $(ALLOBJ_P)
++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P)
++ $(RANLIB) $@
++
+ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX)
+ $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \
+ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX)
+@@ -451,15 +459,24 @@
+ @FRC=$(FRC)
+
+ clean:
+- rm -f *.o
++ rm -f *.$(SUFFIX) *.$(PSUFFIX)
+
+-.f.o:
++%.$(SUFFIX): %.f
+ $(FORTRAN) $(OPTS) -c $< -o $@
+
+-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
++%.$(PSUFFIX): %.f
++ $(FORTRAN) $(POPTS) -c $< -o $@
+
++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++sla_wwaddw.$(SUFFIX): sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++dla_wwaddw.$(SUFFIX): dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++cla_wwaddw.$(SUFFIX): cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++zla_wwaddw.$(SUFFIX): zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++
++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++sla_wwaddw.$(PSUFFIX): sla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++dla_wwaddw.$(PSUFFIX): dla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++cla_wwaddw.$(PSUFFIX): cla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++zla_wwaddw.$(PSUFFIX): zla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
+diff -ruN lapack-3.4.1.old/TESTING/EIG/Makefile lapack-3.4.1/TESTING/EIG/Makefile
+--- lapack-3.4.1.old/TESTING/EIG/Makefile 2011-09-26 23:52:31 +0200
++++ lapack-3.4.1/TESTING/EIG/Makefile 2012-04-22 21:41:45 +0200
+@@ -78,7 +78,7 @@
+ cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \
+ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \
+ chbt21.o chet21.o chet22.o chpt21.o chst01.o \
+- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \
++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \
+ csgt01.o cslect.o \
+ cstt21.o cstt22.o cunt01.o cunt03.o
+
+@@ -115,7 +115,7 @@
+ zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \
+ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \
+ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \
+- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \
++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \
+ zsgt01.o zslect.o \
+ zstt21.o zstt22.o zunt01.o zunt03.o
+
+@@ -129,22 +129,22 @@
+ ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtsts \
+ $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtsts $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtsts $@
+
+ ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstc \
+ $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstc $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstc $@
+
+ ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstd \
+ $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstd $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstd $@
+
+ ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstz \
+ $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstz $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstz $@
+
+ $(AEIGTST): $(FRC)
+ $(SCIGTST): $(FRC)
+diff -ruN lapack-3.4.1.old/TESTING/LIN/Makefile lapack-3.4.1/TESTING/LIN/Makefile
+--- lapack-3.4.1.old/TESTING/LIN/Makefile 2012-04-02 21:06:36 +0200
++++ lapack-3.4.1/TESTING/LIN/Makefile 2012-04-22 21:43:30 +0200
+@@ -109,7 +109,7 @@
+ cqpt01.o cqrt01.o cqrt01p.o cqrt02.o cqrt03.o cqrt11.o \
+ cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \
+ cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \
+- csbmv.o cspt01.o \
++ cspt01.o \
+ cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \
+ ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \
+ ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \
+@@ -188,7 +188,7 @@
+ zqpt01.o zqrt01.o zqrt01p.o zqrt02.o zqrt03.o zqrt11.o \
+ zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \
+ zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \
+- zsbmv.o zspt01.o \
++ zspt01.o \
+ zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \
+ ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \
+ ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \
+@@ -214,7 +214,7 @@
+ zdrvab.o zdrvac.o zerrab.o zerrac.o zget08.o \
+ alaerh.o alahd.o aladhd.o alareq.o \
+ chkxer.o zget02.o zlarhs.o zlatb4.o \
+- zsbmv.o xerbla.o zpot06.o zlaipd.o
++ xerbla.o zpot06.o zlaipd.o
+
+ SLINTSTRFP = schkrfp.o sdrvrfp.o sdrvrf1.o sdrvrf2.o sdrvrf3.o sdrvrf4.o serrrfp.o \
+ slatb4.o slarhs.o sget04.o spot01.o spot03.o spot02.o \
+@@ -225,11 +225,11 @@
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ CLINTSTRFP = cchkrfp.o cdrvrfp.o cdrvrf1.o cdrvrf2.o cdrvrf3.o cdrvrf4.o cerrrfp.o \
+- claipd.o clatb4.o clarhs.o csbmv.o cget04.o cpot01.o cpot03.o cpot02.o \
++ claipd.o clatb4.o clarhs.o cget04.o cpot01.o cpot03.o cpot02.o \
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp.o \
+- zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \
++ zlatb4.o zlaipd.o zlarhs.o zget04.o zpot01.o zpot03.o zpot02.o \
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ all: single double complex complex16 proto-single proto-double proto-complex proto-complex16
+@@ -246,43 +246,43 @@
+
+ xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $^ \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstds : $(DSLINTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(DSLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstzc : $(ZCLINTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ZCLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfs : $(SLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(SLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfd : $(DLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(DLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfc : $(CLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(CLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfz : $(ZLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ZLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ ../xlintsts: xlintsts
+ mv xlintsts $@
+diff -ruN lapack-3.4.1.old/lapacke/src/Makefile lapack-3.4.1/lapacke/src/Makefile
+--- lapack-3.4.1.old/lapacke/src/Makefile 2012-04-02 22:16:32 +0200
++++ lapack-3.4.1/lapacke/src/Makefile 2012-04-22 21:38:38 +0200
+@@ -2040,19 +2040,21 @@
+ lapacke_zlagsy.o \
+ lapacke_zlagsy_work.o
+
+-ALLOBJ = $(SRC_OBJ) $(MATGEN_OBJ)
++OBJ_FILES := $(SRC_OBJ)
+
+-ifdef USEXBLAS
+-ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC)
++ifdef LAPACKE_EXTENDED
++OBJ_FILES += $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC)
+ endif
+
+-
+-OBJ_FILES := $(C_FILES:.o=.o)
++ifdef LAPACKE_TESTING
++OBJ_FILES += $(MATGEN_OBJ)
++endif
+
+ all: ../../$(LAPACKELIB)
+
+-../../$(LAPACKELIB): $(ALLOBJ) $(ALLXOBJ)
+- $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ)
++../../$(LAPACKELIB): $(OBJ_FILES)
++# http://hackage.haskell.org/trac/gtk2hs/ticket/1146
++ echo $(OBJ_FILES) | xargs -n 100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB)
+ $(RANLIB) ../../$(LAPACKELIB)
+
+ .c.o:
diff --git a/patch.for_lapack-3.4.2 b/patch.for_lapack-3.4.2
new file mode 100644
index 000000000..3f7d72ed3
--- /dev/null
+++ b/patch.for_lapack-3.4.2
@@ -0,0 +1,1000 @@
+diff -ruN lapack-3.4.2.old/INSTALL/Makefile lapack-3.4.2/INSTALL/Makefile
+--- lapack-3.4.2.old/INSTALL/Makefile 2011-10-01 04:37:03 +0200
++++ lapack-3.4.2/INSTALL/Makefile 2012-04-22 21:48:48 +0200
+@@ -27,7 +27,7 @@
+ $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o
+
+ clean:
+- rm -f *.o
++ rm -f *.o test*
+ .f.o:
+ $(FORTRAN) $(OPTS) -c $< -o $@
+
+diff -ruN lapack-3.4.2.old/Makefile lapack-3.4.2/Makefile
+--- lapack-3.4.2.old/Makefile 2012-04-13 20:13:07 +0200
++++ lapack-3.4.2/Makefile 2012-04-22 21:48:07 +0200
+@@ -20,9 +20,12 @@
+ blaslib:
+ ( cd BLAS/SRC; $(MAKE) )
+
+-lapacklib: lapack_install
++lapacklib:
+ ( cd SRC; $(MAKE) )
+
++lapack_prof:
++ ( cd SRC; $(MAKE) lapack_prof)
++
+ lapackelib: lapacklib
+ ( cd lapacke; $(MAKE) )
+
+diff -ruN lapack-3.4.2.old/SRC/Makefile lapack-3.4.2/SRC/Makefile
+--- lapack-3.4.2.old/SRC/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.2/SRC/Makefile 2012-04-22 21:40:21 +0200 +@@ -54,363 +54,371 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ +- ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) xerbla_array.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ilaprec.$(SUFFIX) ilatrans.$(SUFFIX) ilauplo.$(SUFFIX) iladiag.$(SUFFIX) chla_transtype.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- slartgp.o slartgs.o \ +- ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX) slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slasq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ slartgp.$(SUFFIX) slartgs.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlasq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- dlartgp.o dlartgs.o \ +- 
../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlasq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ dlartgp.$(SUFFIX) dlartgs.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ +- sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o \ +- sgetc2.o sgetf2.o sgetri.o \ +- sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ +- slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotri.o spstrf.o spstf2.o \ +- sppcon.o sppequ.o \ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- 
sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o \ +- ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ +- ssyswapr.o ssytrs.o ssytrs2.o ssyconv.o \ +- stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ +- slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ +- stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ +- sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ +- sgeequb.o ssyequb.o spoequb.o sgbequb.o \ +- sbbcsd.o slapmr.o sorbdb.o sorcsd.o \ +- sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ +- stpqrt.o stpqrt2.o stpmqrt.o stprfb.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqr2p.$(SUFFIX) sgeqrf.$(SUFFIX) sgeqrfp.$(SUFFIX) sgerfs.$(SUFFIX) \ ++ sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) sgesv.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) \ ++ sgetc2.$(SUFFIX) sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) 
sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotri.$(SUFFIX) spstrf.$(SUFFIX) spstf2.$(SUFFIX) \ ++ sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) \ ++ ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytri2.$(SUFFIX) ssytri2x.$(SUFFIX) \ ++ ssyswapr.$(SUFFIX) ssytrs.$(SUFFIX) ssytrs2.$(SUFFIX) ssyconv.$(SUFFIX) \ ++ stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) \ ++ slansf.$(SUFFIX) spftrf.$(SUFFIX) spftri.$(SUFFIX) spftrs.$(SUFFIX) ssfrk.$(SUFFIX) stfsm.$(SUFFIX) stftri.$(SUFFIX) stfttp.$(SUFFIX) \ ++ stfttr.$(SUFFIX) stpttf.$(SUFFIX) stpttr.$(SUFFIX) strttf.$(SUFFIX) strttp.$(SUFFIX) \ ++ sgejsv.$(SUFFIX) sgesvj.$(SUFFIX) sgsvj0.$(SUFFIX) sgsvj1.$(SUFFIX) \ ++ sgeequb.$(SUFFIX) ssyequb.$(SUFFIX) spoequb.$(SUFFIX) sgbequb.$(SUFFIX) \ ++ sbbcsd.$(SUFFIX) slapmr.$(SUFFIX) sorbdb.$(SUFFIX) sorcsd.$(SUFFIX) \ ++ sgeqrt.$(SUFFIX) sgeqrt2.$(SUFFIX) sgeqrt3.$(SUFFIX) sgemqrt.$(SUFFIX) \ ++ stpqrt.$(SUFFIX) stpqrt2.$(SUFFIX) stpmqrt.$(SUFFIX) stprfb.$(SUFFIX) + +-DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o ++DSLASRC = spotrs.$(SUFFIX) + + ifdef USEXBLAS +-SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ +- sla_gercond.o sla_gerpvgrw.o ssysvxx.o ssyrfsx.o \ +- sla_syrfsx_extended.o sla_syamv.o sla_syrcond.o sla_syrpvgrw.o \ +- sposvxx.o sporfsx.o sla_porfsx_extended.o sla_porcond.o \ +- sla_porpvgrw.o sgbsvxx.o sgbrfsx.o sla_gbrfsx_extended.o \ +- sla_gbamv.o sla_gbrcond.o sla_gbrpvgrw.o sla_lin_berr.o slarscl2.o \ +- slascl2.o sla_wwaddw.o ++SXLASRC = 
sgesvxx.$(SUFFIX) sgerfsx.$(SUFFIX) sla_gerfsx_extended.$(SUFFIX) sla_geamv.$(SUFFIX) \ ++ sla_gercond.$(SUFFIX) sla_gerpvgrw.$(SUFFIX) ssysvxx.$(SUFFIX) ssyrfsx.$(SUFFIX) \ ++ sla_syrfsx_extended.$(SUFFIX) sla_syamv.$(SUFFIX) sla_syrcond.$(SUFFIX) sla_syrpvgrw.$(SUFFIX) \ ++ sposvxx.$(SUFFIX) sporfsx.$(SUFFIX) sla_porfsx_extended.$(SUFFIX) sla_porcond.$(SUFFIX) \ ++ sla_porpvgrw.$(SUFFIX) sgbsvxx.$(SUFFIX) sgbrfsx.$(SUFFIX) sla_gbrfsx_extended.$(SUFFIX) \ ++ sla_gbamv.$(SUFFIX) sla_gbrcond.$(SUFFIX) sla_gbrpvgrw.$(SUFFIX) sla_lin_berr.$(SUFFIX) slarscl2.$(SUFFIX) \ ++ slascl2.$(SUFFIX) sla_wwaddw.$(SUFFIX) + endif + + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ +- cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o \ +- cgesvx.o cgetc2.o cgetf2.o cgetri.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetri2.o chetri2x.o cheswapr.o \ +- chetrs.o chetrs2.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- chptrd.o chptrf.o chptri.o chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ +- clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ +- cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o \ +- csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o csytri2.o csytri2x.o \ +- csyswapr.o csytrs.o csytrs2.o csyconv.o \ +- ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o 
ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ +- chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ +- ctfttr.o ctpttf.o ctpttr.o ctrttf.o ctrttp.o \ +- cgeequb.o cgbequb.o csyequb.o cpoequb.o cheequb.o \ +- cbbcsd.o clapmr.o cunbdb.o cuncsd.o \ +- cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ +- ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqr2p.$(SUFFIX) cgeqrf.$(SUFFIX) cgeqrfp.$(SUFFIX) cgerfs.$(SUFFIX) \ ++ cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesv.$(SUFFIX) cgesvd.$(SUFFIX) \ ++ cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetri2.$(SUFFIX) chetri2x.$(SUFFIX) cheswapr.$(SUFFIX) \ ++ chetrs.$(SUFFIX) chetrs2.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) 
claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) ilaclr.$(SUFFIX) ilaclc.$(SUFFIX) \ ++ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ ++ cppcon.$(SUFFIX) cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) cspmv.$(SUFFIX) cspr.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) \ ++ csycon.$(SUFFIX) csymv.$(SUFFIX) \ ++ csyr.$(SUFFIX) csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) csytri2.$(SUFFIX) csytri2x.$(SUFFIX) \ ++ csyswapr.$(SUFFIX) csytrs.$(SUFFIX) csytrs2.$(SUFFIX) csyconv.$(SUFFIX) \ ++ ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) \ ++ chfrk.$(SUFFIX) ctfttp.$(SUFFIX) clanhf.$(SUFFIX) cpftrf.$(SUFFIX) cpftri.$(SUFFIX) cpftrs.$(SUFFIX) ctfsm.$(SUFFIX) ctftri.$(SUFFIX) \ ++ ctfttr.$(SUFFIX) ctpttf.$(SUFFIX) ctpttr.$(SUFFIX) ctrttf.$(SUFFIX) ctrttp.$(SUFFIX) \ ++ cgeequb.$(SUFFIX) cgbequb.$(SUFFIX) csyequb.$(SUFFIX) cpoequb.$(SUFFIX) cheequb.$(SUFFIX) \ ++ cbbcsd.$(SUFFIX) clapmr.$(SUFFIX) cunbdb.$(SUFFIX) cuncsd.$(SUFFIX) \ ++ cgeqrt.$(SUFFIX) cgeqrt2.$(SUFFIX) cgeqrt3.$(SUFFIX) cgemqrt.$(SUFFIX) \ ++ ctpqrt.$(SUFFIX) ctpqrt2.$(SUFFIX) ctpmqrt.$(SUFFIX) ctprfb.$(SUFFIX) + + ifdef USEXBLAS +-CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ +- cla_gercond_c.o cla_gercond_x.o cla_gerpvgrw.o \ +- csysvxx.o csyrfsx.o 
cla_syrfsx_extended.o cla_syamv.o \ +- cla_syrcond_c.o cla_syrcond_x.o cla_syrpvgrw.o \ +- cposvxx.o cporfsx.o cla_porfsx_extended.o \ +- cla_porcond_c.o cla_porcond_x.o cla_porpvgrw.o \ +- cgbsvxx.o cgbrfsx.o cla_gbrfsx_extended.o cla_gbamv.o \ +- cla_gbrcond_c.o cla_gbrcond_x.o cla_gbrpvgrw.o \ +- chesvxx.o cherfsx.o cla_herfsx_extended.o cla_heamv.o \ +- cla_hercond_c.o cla_hercond_x.o cla_herpvgrw.o \ +- cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o ++CXLASRC = cgesvxx.$(SUFFIX) cgerfsx.$(SUFFIX) cla_gerfsx_extended.$(SUFFIX) cla_geamv.$(SUFFIX) \ ++ cla_gercond_c.$(SUFFIX) cla_gercond_x.$(SUFFIX) cla_gerpvgrw.$(SUFFIX) \ ++ csysvxx.$(SUFFIX) csyrfsx.$(SUFFIX) cla_syrfsx_extended.$(SUFFIX) cla_syamv.$(SUFFIX) \ ++ cla_syrcond_c.$(SUFFIX) cla_syrcond_x.$(SUFFIX) cla_syrpvgrw.$(SUFFIX) \ ++ cposvxx.$(SUFFIX) cporfsx.$(SUFFIX) cla_porfsx_extended.$(SUFFIX) \ ++ cla_porcond_c.$(SUFFIX) cla_porcond_x.$(SUFFIX) cla_porpvgrw.$(SUFFIX) \ ++ cgbsvxx.$(SUFFIX) cgbrfsx.$(SUFFIX) cla_gbrfsx_extended.$(SUFFIX) cla_gbamv.$(SUFFIX) \ ++ cla_gbrcond_c.$(SUFFIX) cla_gbrcond_x.$(SUFFIX) cla_gbrpvgrw.$(SUFFIX) \ ++ chesvxx.$(SUFFIX) cherfsx.$(SUFFIX) cla_herfsx_extended.$(SUFFIX) cla_heamv.$(SUFFIX) \ ++ cla_hercond_c.$(SUFFIX) cla_hercond_x.$(SUFFIX) cla_herpvgrw.$(SUFFIX) \ ++ cla_lin_berr.$(SUFFIX) clarscl2.$(SUFFIX) clascl2.$(SUFFIX) cla_wwaddw.$(SUFFIX) + endif + +-ZCLASRC = cpotrs.o cgetrs.o cpotrf.o cgetrf.o ++ZCLASRC = cpotrs.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ +- dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o \ +- dgetc2.o dgetf2.o dgetrf.o dgetri.o \ +- dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ +- dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ +- dlargv.o dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dpstrf.o dpstf2.o \ +- dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o 
dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o \ +- dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o dsytri.o dsytri2.o dsytri2x.o \ +- dsyswapr.o dsytrs.o dsytrs2.o dsyconv.o \ +- dtbcon.o dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ +- dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ +- dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ +- dgejsv.o dgesvj.o dgsvj0.o dgsvj1.o \ +- dgeequb.o dsyequb.o dpoequb.o dgbequb.o \ +- dbbcsd.o dlapmr.o dorbdb.o dorcsd.o \ +- dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ +- dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqr2p.$(SUFFIX) dgeqrf.$(SUFFIX) dgeqrfp.$(SUFFIX) dgerfs.$(SUFFIX) \ ++ dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesv.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) \ ++ dgetc2.$(SUFFIX) dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ ++ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) 
dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotri.$(SUFFIX) dpotrs.$(SUFFIX) dpstrf.$(SUFFIX) dpstf2.$(SUFFIX) \ ++ dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) \ ++ dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytri2.$(SUFFIX) dsytri2x.$(SUFFIX) \ ++ dsyswapr.$(SUFFIX) dsytrs.$(SUFFIX) dsytrs2.$(SUFFIX) dsyconv.$(SUFFIX) \ ++ dtbcon.$(SUFFIX) dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dsposv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) dlat2s.$(SUFFIX) \ ++ dlansf.$(SUFFIX) dpftrf.$(SUFFIX) dpftri.$(SUFFIX) dpftrs.$(SUFFIX) dsfrk.$(SUFFIX) dtfsm.$(SUFFIX) dtftri.$(SUFFIX) dtfttp.$(SUFFIX) \ ++ dtfttr.$(SUFFIX) dtpttf.$(SUFFIX) dtpttr.$(SUFFIX) dtrttf.$(SUFFIX) dtrttp.$(SUFFIX) \ ++ dgejsv.$(SUFFIX) dgesvj.$(SUFFIX) dgsvj0.$(SUFFIX) dgsvj1.$(SUFFIX) \ ++ dgeequb.$(SUFFIX) dsyequb.$(SUFFIX) dpoequb.$(SUFFIX) dgbequb.$(SUFFIX) \ ++ dbbcsd.$(SUFFIX) dlapmr.$(SUFFIX) dorbdb.$(SUFFIX) dorcsd.$(SUFFIX) \ ++ dgeqrt.$(SUFFIX) dgeqrt2.$(SUFFIX) dgeqrt3.$(SUFFIX) dgemqrt.$(SUFFIX) \ ++ dtpqrt.$(SUFFIX) dtpqrt2.$(SUFFIX) dtpmqrt.$(SUFFIX) dtprfb.$(SUFFIX) + + ifdef USEXBLAS +-DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ +- dla_gercond.o dla_gerpvgrw.o dsysvxx.o dsyrfsx.o \ +- dla_syrfsx_extended.o dla_syamv.o dla_syrcond.o dla_syrpvgrw.o \ +- dposvxx.o dporfsx.o dla_porfsx_extended.o dla_porcond.o \ +- dla_porpvgrw.o 
dgbsvxx.o dgbrfsx.o dla_gbrfsx_extended.o \ +- dla_gbamv.o dla_gbrcond.o dla_gbrpvgrw.o dla_lin_berr.o dlarscl2.o \ +- dlascl2.o dla_wwaddw.o ++DXLASRC = dgesvxx.$(SUFFIX) dgerfsx.$(SUFFIX) dla_gerfsx_extended.$(SUFFIX) dla_geamv.$(SUFFIX) \ ++ dla_gercond.$(SUFFIX) dla_gerpvgrw.$(SUFFIX) dsysvxx.$(SUFFIX) dsyrfsx.$(SUFFIX) \ ++ dla_syrfsx_extended.$(SUFFIX) dla_syamv.$(SUFFIX) dla_syrcond.$(SUFFIX) dla_syrpvgrw.$(SUFFIX) \ ++ dposvxx.$(SUFFIX) dporfsx.$(SUFFIX) dla_porfsx_extended.$(SUFFIX) dla_porcond.$(SUFFIX) \ ++ dla_porpvgrw.$(SUFFIX) dgbsvxx.$(SUFFIX) dgbrfsx.$(SUFFIX) dla_gbrfsx_extended.$(SUFFIX) \ ++ dla_gbamv.$(SUFFIX) dla_gbrcond.$(SUFFIX) dla_gbrpvgrw.$(SUFFIX) dla_lin_berr.$(SUFFIX) dlarscl2.$(SUFFIX) \ ++ dlascl2.$(SUFFIX) dla_wwaddw.$(SUFFIX) + endif + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o \ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetri2.o zhetri2x.o zheswapr.o \ +- zhetrs.o zhetrs2.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ +- zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o zlarfgp.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ +- zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o \ +- zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o zsytri2.o zsytri2x.o \ +- zsyswapr.o zsytrs.o zsytrs2.o zsyconv.o \ +- ztbcon.o ztbrfs.o ztbtrs.o 
ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ +- zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ +- ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ +- zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ +- zbbcsd.o zlapmr.o zunbdb.o zuncsd.o \ +- zgeqrt.o zgeqrt2.o zgeqrt3.o zgemqrt.o \ +- ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqr2p.$(SUFFIX) zgeqrf.$(SUFFIX) zgeqrfp.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesv.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetri2.$(SUFFIX) zhetri2x.$(SUFFIX) zheswapr.$(SUFFIX) \ ++ zhetrs.$(SUFFIX) zhetrs2.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) 
zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) ilazlr.$(SUFFIX) ilazlc.$(SUFFIX) \ ++ zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotri.$(SUFFIX) zpotrs.$(SUFFIX) zpstrf.$(SUFFIX) zpstf2.$(SUFFIX) \ ++ zppcon.$(SUFFIX) zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zspmv.$(SUFFIX) zspr.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) \ ++ zsycon.$(SUFFIX) zsymv.$(SUFFIX) \ ++ zsyr.$(SUFFIX) zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) zsytri2.$(SUFFIX) zsytri2x.$(SUFFIX) \ ++ zsyswapr.$(SUFFIX) zsytrs.$(SUFFIX) zsytrs2.$(SUFFIX) zsyconv.$(SUFFIX) \ ++ ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zcposv.$(SUFFIX) zlag2c.$(SUFFIX) clag2z.$(SUFFIX) zlat2c.$(SUFFIX) \ ++ zhfrk.$(SUFFIX) ztfttp.$(SUFFIX) zlanhf.$(SUFFIX) zpftrf.$(SUFFIX) zpftri.$(SUFFIX) zpftrs.$(SUFFIX) ztfsm.$(SUFFIX) ztftri.$(SUFFIX) \ ++ ztfttr.$(SUFFIX) ztpttf.$(SUFFIX) ztpttr.$(SUFFIX) ztrttf.$(SUFFIX) ztrttp.$(SUFFIX) \ ++ zgeequb.$(SUFFIX) zgbequb.$(SUFFIX) zsyequb.$(SUFFIX) zpoequb.$(SUFFIX) zheequb.$(SUFFIX) \ 
++ zbbcsd.$(SUFFIX) zlapmr.$(SUFFIX) zunbdb.$(SUFFIX) zuncsd.$(SUFFIX) \
++ zgeqrt.$(SUFFIX) zgeqrt2.$(SUFFIX) zgeqrt3.$(SUFFIX) zgemqrt.$(SUFFIX) \
++ ztpqrt.$(SUFFIX) ztpqrt2.$(SUFFIX) ztpmqrt.$(SUFFIX) ztprfb.$(SUFFIX)
+
+ ifdef USEXBLAS
+-ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \
+- zla_gercond_c.o zla_gercond_x.o zla_gerpvgrw.o zsysvxx.o zsyrfsx.o \
+- zla_syrfsx_extended.o zla_syamv.o zla_syrcond_c.o zla_syrcond_x.o \
+- zla_syrpvgrw.o zposvxx.o zporfsx.o zla_porfsx_extended.o \
+- zla_porcond_c.o zla_porcond_x.o zla_porpvgrw.o zgbsvxx.o zgbrfsx.o \
+- zla_gbrfsx_extended.o zla_gbamv.o zla_gbrcond_c.o zla_gbrcond_x.o \
+- zla_gbrpvgrw.o zhesvxx.o zherfsx.o zla_herfsx_extended.o \
+- zla_heamv.o zla_hercond_c.o zla_hercond_x.o zla_herpvgrw.o \
+- zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o
++ZXLASRC = zgesvxx.$(SUFFIX) zgerfsx.$(SUFFIX) zla_gerfsx_extended.$(SUFFIX) zla_geamv.$(SUFFIX) \
++ zla_gercond_c.$(SUFFIX) zla_gercond_x.$(SUFFIX) zla_gerpvgrw.$(SUFFIX) zsysvxx.$(SUFFIX) zsyrfsx.$(SUFFIX) \
++ zla_syrfsx_extended.$(SUFFIX) zla_syamv.$(SUFFIX) zla_syrcond_c.$(SUFFIX) zla_syrcond_x.$(SUFFIX) \
++ zla_syrpvgrw.$(SUFFIX) zposvxx.$(SUFFIX) zporfsx.$(SUFFIX) zla_porfsx_extended.$(SUFFIX) \
++ zla_porcond_c.$(SUFFIX) zla_porcond_x.$(SUFFIX) zla_porpvgrw.$(SUFFIX) zgbsvxx.$(SUFFIX) zgbrfsx.$(SUFFIX) \
++ zla_gbrfsx_extended.$(SUFFIX) zla_gbamv.$(SUFFIX) zla_gbrcond_c.$(SUFFIX) zla_gbrcond_x.$(SUFFIX) \
++ zla_gbrpvgrw.$(SUFFIX) zhesvxx.$(SUFFIX) zherfsx.$(SUFFIX) zla_herfsx_extended.$(SUFFIX) \
++ zla_heamv.$(SUFFIX) zla_hercond_c.$(SUFFIX) zla_hercond_x.$(SUFFIX) zla_herpvgrw.$(SUFFIX) \
++ zla_lin_berr.$(SUFFIX) zlarscl2.$(SUFFIX) zlascl2.$(SUFFIX) zla_wwaddw.$(SUFFIX)
+ endif
+
+ ALLOBJ = $(SLASRC) $(DLASRC) $(DSLASRC) $(CLASRC) $(ZLASRC) $(ZCLASRC) \
+ $(SCLAUX) $(DZLAUX) $(ALLAUX)
+
++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX))
++
+ ifdef USEXBLAS
+ ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC)
+ endif
+
+ all: ../$(LAPACKLIB)
+
++lapack_prof: ../$(LAPACKLIB_P)
++
+ ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ)
+ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ)
+ $(RANLIB) $@
+
++../$(LAPACKLIB_P): $(ALLOBJ_P)
++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P)
++ $(RANLIB) $@
++
+ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX)
+ $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \
+ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX)
+@@ -451,15 +459,24 @@
+ @FRC=$(FRC)
+
+ clean:
+- rm -f *.o
++ rm -f *.$(SUFFIX) *.$(PSUFFIX)
+
+-.f.o:
++%.$(SUFFIX): %.f
+ $(FORTRAN) $(OPTS) -c $< -o $@
+
+-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
+-zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@
++%.$(PSUFFIX): %.f
++ $(FORTRAN) $(POPTS) -c $< -o $@
+
++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++sla_wwaddw.$(SUFFIX): sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++dla_wwaddw.$(SUFFIX): dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++cla_wwaddw.$(SUFFIX): cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++zla_wwaddw.$(SUFFIX): zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@
++
++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++sla_wwaddw.$(PSUFFIX): sla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++dla_wwaddw.$(PSUFFIX): dla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++cla_wwaddw.$(PSUFFIX): cla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
++zla_wwaddw.$(PSUFFIX): zla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@
+diff -ruN lapack-3.4.2.old/TESTING/EIG/Makefile lapack-3.4.2/TESTING/EIG/Makefile
+--- lapack-3.4.2.old/TESTING/EIG/Makefile 2011-09-26 23:52:31 +0200
++++ lapack-3.4.2/TESTING/EIG/Makefile 2012-04-22 21:41:45 +0200
+@@ -78,7 +78,7 @@
+ cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \
+ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \
+ chbt21.o chet21.o chet22.o chpt21.o chst01.o \
+- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \
++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \
+ csgt01.o cslect.o \
+ cstt21.o cstt22.o cunt01.o cunt03.o
+
+@@ -115,7 +115,7 @@
+ zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \
+ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \
+ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \
+- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \
++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \
+ zsgt01.o zslect.o \
+ zstt21.o zstt22.o zunt01.o zunt03.o
+
+@@ -129,22 +129,22 @@
+ ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtsts \
+ $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtsts $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtsts $@
+
+ ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstc \
+ $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstc $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstc $@
+
+ ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstd \
+ $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstd $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstd $@
+
+ ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \
+ $(LOADER) $(LOADOPTS) -o xeigtstz \
+ $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \
+- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstz $@
++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstz $@
+
+ $(AEIGTST): $(FRC)
+ $(SCIGTST): $(FRC)
+diff -ruN lapack-3.4.2.old/TESTING/LIN/Makefile lapack-3.4.2/TESTING/LIN/Makefile
+--- lapack-3.4.2.old/TESTING/LIN/Makefile 2012-04-02 21:06:36 +0200
++++ lapack-3.4.2/TESTING/LIN/Makefile 2012-04-22 21:43:30 +0200
+@@ -109,7 +109,7 @@
+ cqpt01.o cqrt01.o cqrt01p.o cqrt02.o cqrt03.o cqrt11.o \
+ cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \
+ cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \
+- csbmv.o cspt01.o \
++ cspt01.o \
+ cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \
+ ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \
+ ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \
+@@ -188,7 +188,7 @@
+ zqpt01.o zqrt01.o zqrt01p.o zqrt02.o zqrt03.o zqrt11.o \
+ zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \
+ zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \
+- zsbmv.o zspt01.o \
++ zspt01.o \
+ zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \
+ ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \
+ ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \
+@@ -214,7 +214,7 @@
+ zdrvab.o zdrvac.o zerrab.o zerrac.o zget08.o \
+ alaerh.o alahd.o aladhd.o alareq.o \
+ chkxer.o zget02.o zlarhs.o zlatb4.o \
+- zsbmv.o xerbla.o zpot06.o zlaipd.o
++ xerbla.o zpot06.o zlaipd.o
+
+ SLINTSTRFP = schkrfp.o sdrvrfp.o sdrvrf1.o sdrvrf2.o sdrvrf3.o sdrvrf4.o serrrfp.o \
+ slatb4.o slarhs.o sget04.o spot01.o spot03.o spot02.o \
+@@ -225,11 +225,11 @@
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ CLINTSTRFP = cchkrfp.o cdrvrfp.o cdrvrf1.o cdrvrf2.o cdrvrf3.o cdrvrf4.o cerrrfp.o \
+- claipd.o clatb4.o clarhs.o csbmv.o cget04.o cpot01.o cpot03.o cpot02.o \
++ claipd.o clatb4.o clarhs.o cget04.o cpot01.o cpot03.o cpot02.o \
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp.o \
+- zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \
++ zlatb4.o zlaipd.o zlarhs.o zget04.o zpot01.o zpot03.o zpot02.o \
+ chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
+
+ all: single double complex complex16 proto-single proto-double proto-complex proto-complex16
+@@ -246,43 +246,43 @@
+
+ xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $^ \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstds : $(DSLINTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(DSLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstzc : $(ZCLINTST) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ZCLINTST) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfs : $(SLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(SLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfd : $(DLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(DLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfc : $(CLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(CLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ xlintstrfz : $(ZLINTSTRFP) ../../$(LAPACKLIB)
+ $(LOADER) $(LOADOPTS) $(ZLINTSTRFP) \
+- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB)
+
+ ../xlintsts: xlintsts
+ mv xlintsts $@
+diff -ruN lapack-3.4.2.old/lapacke/src/Makefile lapack-3.4.2/lapacke/src/Makefile
+--- lapack-3.4.2.old/lapacke/src/Makefile 2012-09-21 04:21:29 +0200
++++ lapack-3.4.2/lapacke/src/Makefile 2012-10-15 22:04:56 +0200
+@@ -34,7 +34,7 @@
+ #
+ include ../../make.inc
+
+-SRC_OBJ = \
++CSRC_OBJ = \
+ lapacke_cbbcsd.o \
+ lapacke_cbbcsd_work.o \
+ lapacke_cbdsqr.o \
+@@ -526,7 +526,9 @@
+ lapacke_cupgtr.o \
+ lapacke_cupgtr_work.o \
+ lapacke_cupmtr.o \
+-lapacke_cupmtr_work.o \
++lapacke_cupmtr_work.o
++
++DSRC_OBJ = \
+ lapacke_dbbcsd.o \
+ lapacke_dbbcsd_work.o \
+ lapacke_dbdsdc.o \
+@@ -1012,7 +1014,9 @@
+ lapacke_dtrttp.o \
+ lapacke_dtrttp_work.o \
+ lapacke_dtzrzf.o \
+-lapacke_dtzrzf_work.o \
++lapacke_dtzrzf_work.o
++
++SSRC_OBJ = \
+ lapacke_sbbcsd.o \
+ lapacke_sbbcsd_work.o \
+ lapacke_sbdsdc.o \
+@@ -1492,7 +1496,9 @@
+ lapacke_strttp.o \
+ lapacke_strttp_work.o \
+ lapacke_stzrzf.o \
+-lapacke_stzrzf_work.o \
++lapacke_stzrzf_work.o
++
++ZSRC_OBJ = \
+ lapacke_zbbcsd.o \
+ lapacke_zbbcsd_work.o \
+ lapacke_zbdsqr.o \
+@@ -2041,19 +2047,29 @@
+ lapacke_zlagsy.o \
+ lapacke_zlagsy_work.o
+
+-ALLOBJ = $(SRC_OBJ) $(MATGEN_OBJ)
++COBJ_FILES := $(CSRC_OBJ)
++SOBJ_FILES := $(SSRC_OBJ)
++DOBJ_FILES := $(DSRC_OBJ)
++ZOBJ_FILES := $(ZSRC_OBJ)
+
+-ifdef USEXBLAS
+-ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC)
++ifdef LAPACKE_EXTENDED
++OBJ_FILES += $(SRCX_OBJ)
+ endif
+
+-
+-OBJ_FILES := $(C_FILES:.o=.o)
++ifdef LAPACKE_TESTING
++OBJ_FILES += $(MATGEN_OBJ)
++endif
+
+ all: ../../$(LAPACKELIB)
+
+-../../$(LAPACKELIB): $(ALLOBJ) $(ALLXOBJ)
+- $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ)
++../../$(LAPACKELIB): $(COBJ_FILES) $(DOBJ_FILES) $(SOBJ_FILES) $(ZOBJ_FILES) $(OBJ_FILES)
++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(COBJ_FILES)
++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(DOBJ_FILES)
++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(SOBJ_FILES)
++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ZOBJ_FILES)
++ifneq ($(strip $(OBJ_FILES)),)
++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(OBJ_FILES)
++endif
+ $(RANLIB) ../../$(LAPACKELIB)
+
+ .c.o:
+diff -ruN lapack-3.4.2.old/lapacke/example/Makefile lapack-3.4.2/lapacke/example/Makefile
+--- lapack-3.4.2.old/lapacke/example/Makefile 2012-03-23 06:55:22.000000000 +0800
++++ lapack-3.4.2/lapacke/example/Makefile 2012-11-13 00:32:24.125449952 +0800
+@@ -4,12 +4,12 @@
+
+ xexample_DGESV_rowmajor: example_DGESV_rowmajor.o ../../$(LAPACKLIB) ../../$(LAPACKELIB)
+ $(LOADER) $(LOADOPTS) example_DGESV_rowmajor.o \
+- ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(LAPACKELIB) $(CEXTRALIB) -o $@
+ ./$@
+
+ xexample_ZGESV_rowmajor: example_ZGESV_rowmajor.o ../../$(LAPACKLIB) ../../$(LAPACKELIB)
+ $(LOADER) $(LOADOPTS) example_ZGESV_rowmajor.o \
+- ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@
++ ../../$(LAPACKELIB) $(CEXTRALIB) -o $@
+ ./$@
+
+ .c.o:
diff --git a/quickbuild.win32 b/quickbuild.win32
index 29949c192..3d7db1770 100644
--- a/quickbuild.win32
+++ b/quickbuild.win32
@@ -1,3 +1,4 @@
 #!/bin/bash
+echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
 
 make BINARY=32 CC=gcc FC=gfortran
diff --git a/quickbuild.win64 b/quickbuild.win64
index 88f748a8d..8f0189435 100644
--- a/quickbuild.win64
+++ b/quickbuild.win64
@@ -1,3 +1,4 @@
 #!/bin/bash
+echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio "
 
 make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran
diff --git a/segfaults.patch b/segfaults.patch
new file mode 100644
index 000000000..375ab766c
--- /dev/null
+++ b/segfaults.patch
@@ -0,0 +1,12 @@
+diff -ruN common_linux.h.orig common_linux.h
+--- common_linux.h.orig 2012-04-23 11:27:55.000000000 +0800
++++ common_linux.h 2012-05-08 23:43:00.000000000 +0800
+@@ -77,7 +77,7 @@
+ #else
+ //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
+ // unsigned long null_nodemask=0;
+- return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
++ return 0;
+ #endif
+ }
+
diff --git a/utest/Makefile b/utest/Makefile
index e7c5f3412..3d120f5b3 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -3,20 +3,50 @@ TOPDIR = ..
 include $(TOPDIR)/Makefile.system
 
 TARGET=openblas_utest
 
-CUNIT_LIB=/usr/local/lib/libcunit.a
-OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o
+CUNIT_URL=http://downloads.sourceforge.net/project/cunit/CUnit/2.1-2/CUnit-2.1-2-src.tar.bz2
+CUNIT_DIR=$(CURDIR)/CUnit-2.1-2
+
+CUNIT_LIB=$(CUNIT_DIR)/lib/libcunit.a
+
+CFLAGS+=-I$(CUNIT_DIR)/include
+
+OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_amax.o
 
 all : run_test
 
-$(TARGET): $(OBJS)
- $(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
+CUnit-2.1-2-src.tar.bz2:
+ifeq ($(OSNAME), Darwin)
+ curl -O $(CUNIT_URL)
+else
+ wget $(CUNIT_URL)
+endif
+
+$(CUNIT_DIR): CUnit-2.1-2-src.tar.bz2
+ @if test `$(MD5SUM) CUnit-2.1-2-src.tar.bz2 | $(AWK) '{print $$1}'` = 31c62bd7a65007737ba28b7aafc44d3a; then \
+ echo $(TAR) xjf $< ;\
+ $(TAR) xjf $< ; \
+ else \
+ rm -rf $(CUNIT_DIR) ;\
+ echo " Cannot download CUnit-2.1-2-src.tar.bz2 or the MD5 checksum is wrong (Please use original)."; \
+ exit 1; \
+ fi
+
+
+$(CUNIT_LIB): $(CUNIT_DIR)
+ (cd $(CUNIT_DIR); CC=$(CC) CFLAGS="$(CFLAGS)" ./configure --prefix=$(CUNIT_DIR))
+ $(MAKE) -C $(CUNIT_DIR)
+ $(MAKE) -C $(CUNIT_DIR) install
+
+$(TARGET): $(CUNIT_LIB) $(OBJS)
+ $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
 
 run_test: $(TARGET)
 ./$(TARGET)
 
 clean:
- rm -f *.o $(TARGET)
+ -rm -f *.o $(TARGET)
+ -rm -rf $(CUNIT_DIR)
 
 libs:
diff --git a/utest/common_utest.h b/utest/common_utest.h
index 1332ef6ab..e57ae0556 100644
--- a/utest/common_utest.h
+++ b/utest/common_utest.h
@@ -61,4 +61,6 @@
 void test_drotmg(void);
 void test_dsdot_n_1(void);
 
+void test_samax(void);
+
 #endif
diff --git a/utest/main.c b/utest/main.c
index 135709507..ece94dd71 100644
--- a/utest/main.c
+++ b/utest/main.c
@@ -58,6 +58,8 @@ CU_TestInfo test_level1[]={
 {"Testing drotmg",test_drotmg},
 {"Testing dsdot with n == 1",test_dsdot_n_1},
+
+ {"Testing samax", test_samax},
 CU_TEST_INFO_NULL,
 };
diff --git a/utest/test_amax.c b/utest/test_amax.c
new file mode 100644
index 000000000..8d163853a
--- /dev/null
+++ b/utest/test_amax.c
@@ -0,0 +1,46 @@
+/*****************************************************************************
+Copyright (c) 2011-2012, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common_utest.h"
+
+void test_samax()
+{
+ int N=3, inc=1;
+ float te_max=0.0, tr_max=0.0;
+ float x[]={-1.1, 2.2, -3.3};
+
+ te_max=BLASFUNC(samax)(&N, x, &inc);
+
+ tr_max=BLASFUNC_REF(samax)(&N, x, &inc);
+
+ CU_ASSERT_DOUBLE_EQUAL(te_max, tr_max, CHECK_EPS);
+}
diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c
index e51e6b299..9a1a3d084 100644
--- a/utest/test_rotmg.c
+++ b/utest/test_rotmg.c
@@ -38,12 +38,18 @@ void test_drotmg()
 double te_d2, tr_d2;
 double te_x1, tr_x1;
 double te_y1, tr_y1;
- double te_param[5],tr_param[5];
+ double te_param[5];
+ double tr_param[5];
 int i=0;
 te_d1= tr_d1=0.21149573940783739;
 te_d2= tr_d2=0.046892057172954082;
 te_x1= tr_x1=-0.42272687517106533;
 te_y1= tr_y1=0.42211309121921659;
+
+ for(i=0; i<5; i++){
+ te_param[i]=tr_param[i]=0.0;
+ }
+
 //OpenBLAS
 BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
 //reference