Merge branch 'release-0.1.0'

This commit is contained in:
Xianyi Zhang 2012-03-23 18:52:40 +08:00
commit 09f74f6d23
55 changed files with 27107 additions and 78 deletions

View File

@ -1,4 +1,22 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.1.0
23-Mar-2012
common:
* Set soname of shared library on Linux.
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
this flag to control the library name, e.g. libopenblas.a,
libopenblas_ifort.a or libopenblas_omp.a.
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
The library uses a single thread in GEMM functions for small matrices.
x86/x86_64:
* Used GEMV SSE/SSE2 kernels on x86 32-bit.
* Exported CBLAS functions in Windows DLL.
MIPS64:
* Completed Level-3 BLAS optimization on Loongson 3A CPU.
* Improved GEMV performance on Loongson 3A CPU.
* Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL)
==================================================================== ====================================================================
Version 0.1 alpha2.5 Version 0.1 alpha2.5
19-Feb-2012 19-Feb-2012

View File

@ -82,27 +82,28 @@ endif
shared : shared :
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
$(MAKE) -C exports so $(MAKE) -C exports so
-ln -fs $(LIBSONAME) libopenblas.so -ln -fs $(LIBSONAME) $(LIBPREFIX).so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
$(MAKE) -C exports so $(MAKE) -C exports so
-ln -fs $(LIBSONAME) libopenblas.so -ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), NetBSD) ifeq ($(OSNAME), NetBSD)
$(MAKE) -C exports so $(MAKE) -C exports so
-ln -fs $(LIBSONAME) libopenblas.so -ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
$(MAKE) -C exports dyn $(MAKE) -C exports dyn
-ln -fs $(LIBDYNNAME) libopenblas.dylib -ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) libopenblas.dll -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) libopenblas.dll -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif endif
tests : tests :
@ -130,7 +131,7 @@ endif
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
endif endif
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) -ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(SUBDIRS) ; \ for d in $(SUBDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -158,7 +159,7 @@ endif
prof : prof_blas prof_lapack prof : prof_blas prof_lapack
prof_blas : prof_blas :
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ; \ for d in $(SUBDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d prof || exit 1 ; \ $(MAKE) -C $$d prof || exit 1 ; \
@ -169,7 +170,7 @@ ifdef DYNAMIC_ARCH
endif endif
blas : blas :
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ; \ for d in $(BLASDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d libs || exit 1 ; \ $(MAKE) -C $$d libs || exit 1 ; \
@ -177,7 +178,7 @@ blas :
done done
hpl : hpl :
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ../laswp exports ; \ for d in $(BLASDIRS) ../laswp exports ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -191,7 +192,7 @@ ifdef DYNAMIC_ARCH
endif endif
hpl_p : hpl_p :
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ../laswp exports ; \ for d in $(SUBDIRS) ../laswp exports ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -285,7 +286,8 @@ clean ::
#ifdef DYNAMIC_ARCH #ifdef DYNAMIC_ARCH
@$(MAKE) -C kernel clean @$(MAKE) -C kernel clean
#endif #endif
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h @$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.4.0; then \ @if test -d lapack-3.4.0; then \
echo deleting lapack-3.4.0; \ echo deleting lapack-3.4.0; \

View File

@ -38,33 +38,34 @@ install : lib.grd
#for install static library #for install static library
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.$(LIBSUFFIX) @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
#for install shared library #for install shared library
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), FreeBSD) ifeq ($(OSNAME), FreeBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), NetBSD) ifeq ($(OSNAME), NetBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) -install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dylib -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
endif endif
@echo Install OK! @echo Install OK!

View File

@ -3,7 +3,12 @@
# #
# This library's version # This library's version
VERSION = 0.1alpha2.5 VERSION = 0.1.0
# If you set this suffix, the libraries will be named libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so, and the soname of the shared library
# will be libopenblas_$(LIBNAMESUFFIX).so.0.
# LIBNAMESUFFIX = omp
# You can specify the target architecture, otherwise it's # You can specify the target architecture, otherwise it's
# automatically detected. # automatically detected.
@ -83,6 +88,11 @@ VERSION = 0.1alpha2.5
# If you need to synchronize FP CSR between threads (for x86/x86_64 only). # If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be
# executed with a single thread. You can use this flag to avoid the overhead of
# multi-threading for small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very # If you need santy check by comparing reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).
# SANITY_CHECK = 1 # SANITY_CHECK = 1

View File

@ -40,6 +40,11 @@ ifdef INTERFACE64
GETARCH_FLAGS += -DUSE64BITINT GETARCH_FLAGS += -DUSE64BITINT
endif endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
# This operation is expensive, so execution should be once. # This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
@ -274,7 +279,12 @@ endif
BINARY_DEFINED = 1 BINARY_DEFINED = 1
endif endif
ifeq ($(CORE), LOONGSON3A) ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64 CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64
endif endif
@ -341,7 +351,8 @@ endif
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
EXTRALIB += -lgfortran
ifdef NO_BINARY_MODE ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
ifdef BINARY64 ifdef BINARY64
@ -528,8 +539,10 @@ ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER
ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
ifneq ($(CORE), LOONGSON3B)
USE_SIMPLE_THREADED_LEVEL3 = 1 USE_SIMPLE_THREADED_LEVEL3 = 1
endif endif
endif
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
@ -568,7 +581,11 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif endif
ifndef LIBNAMESUFFIX
LIBPREFIX = libopenblas LIBPREFIX = libopenblas
else
LIBPREFIX = libopenblas_$(LIBNAMESUFFIX)
endif
KERNELDIR = $(TOPDIR)/kernel/$(ARCH) KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
@ -590,9 +607,11 @@ endif
ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86_64)
ifneq ($(ARCH), x86) ifneq ($(ARCH), x86)
ifneq ($(CORE), LOONGSON3B)
NO_AFFINITY = 1 NO_AFFINITY = 1
endif endif
endif endif
endif
ifdef NO_AFFINITY ifdef NO_AFFINITY
CCOMMON_OPT += -DNO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY
@ -636,6 +655,7 @@ MD5SUM = md5sum
AWK = awk AWK = awk
REVISION = -r$(VERSION) REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)

1
README
View File

@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
9.Known Issues 9.Known Issues
* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32. is 64. On 32 bits, it is 32.
* On Loongson 3A, "make test" may fail because of a pthread_create error (error code EAGAIN). However, the same test case runs fine when it is executed directly from the shell, so we do not think this is a bug in OpenBLAS.
10. Specification of Git Branches 10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).

View File

@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
static inline int my_mbind(void *addr, unsigned long len, int mode, static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode, unsigned long *nodemask, unsigned long maxnode,
unsigned flags) { unsigned flags) {
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
return 0; //NULL Implementation on Loongson 3B 32bit.
#endif
#else
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
unsigned long null_nodemask=0; // unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif
} }
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

View File

@ -2127,7 +2127,9 @@
#endif #endif
#ifndef ASSEMBLER #ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sgemm_p; extern BLASLONG sgemm_p;
extern BLASLONG sgemm_q; extern BLASLONG sgemm_q;
extern BLASLONG sgemm_r; extern BLASLONG sgemm_r;

View File

@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){ static inline unsigned int rpcc(void){
unsigned long ret; unsigned long ret;
#if defined(LOONGSON3A) #if defined(LOONGSON3A) || defined(LOONGSON3B)
unsigned long long tmp; // unsigned long long tmp;
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
ret=tmp; //ret=tmp;
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else #else
__asm__ __volatile__(".set push \n" __asm__ __volatile__(".set push \n"
".set mips32r2\n" ".set mips32r2\n"
@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
return ret; return ret;
} }
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
static inline int WhereAmI(void){
int ret=0;
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $0\n"
".set pop": "=r"(ret):: "memory");
return ret;
}
#endif
#endif
static inline int blas_quickdivide(blasint x, blasint y){ static inline int blas_quickdivide(blasint x, blasint y){
return x / y; return x / y;
} }
@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define CMPEQ c.eq.d #define CMPEQ c.eq.d
#define CMPLE c.le.d #define CMPLE c.le.d
#define CMPLT c.lt.d #define CMPLT c.lt.d
#define NEG neg.d
#else #else
#define LD lwc1 #define LD lwc1
#define ST swc1 #define ST swc1
@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define CMPEQ c.eq.s #define CMPEQ c.eq.s
#define CMPLE c.le.s #define CMPLE c.le.s
#define CMPLT c.lt.s #define CMPLT c.lt.s
#define PLU plu.ps
#define PLL pll.ps
#define PUU puu.ps
#define PUL pul.ps
#define MADPS madd.ps
#define CVTU cvt.s.pu
#define CVTL cvt.s.pl
#define NEG neg.s
#endif #endif
#if defined(__64BIT__) && defined(USE64BITINT) #if defined(__64BIT__) && defined(USE64BITINT)
@ -218,13 +247,18 @@ REALNAME: ;\
#define SEEK_ADDRESS #define SEEK_ADDRESS
#define BUFFER_SIZE ( 8 << 20) #define BUFFER_SIZE ( 32 << 20)
#if defined(LOONGSON3A) #if defined(LOONGSON3A)
#define PAGESIZE (16UL << 10) #define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10)
#endif #endif
#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#endif
#ifndef PAGESIZE #ifndef PAGESIZE
#define PAGESIZE (64UL << 10) #define PAGESIZE (64UL << 10)
#endif #endif
@ -236,7 +270,7 @@ REALNAME: ;\
#define MAP_ANONYMOUS MAP_ANON #define MAP_ANONYMOUS MAP_ANON
#endif #endif
#if defined(LOONGSON3A) #if defined(LOONGSON3A) || defined(LOONGSON3B)
#define PREFETCHD_(x) ld $0, x #define PREFETCHD_(x) ld $0, x
#define PREFETCHD(x) PREFETCHD_(x) #define PREFETCHD(x) PREFETCHD_(x)
#else #else

View File

@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_SICORTEX 1 #define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2 #define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKOWN",
"SICORTEX", "SICORTEX",
"LOONGSON3A" "LOONGSON3A",
"LOONGSON3B"
}; };
int detect(void){ int detect(void){
@ -101,6 +103,8 @@ int detect(void){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A; return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){ }else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){ while (fgets(buffer, sizeof(buffer), infile)){
@ -130,6 +134,8 @@ void get_architecture(void){
void get_subarchitecture(void){ void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) { if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A"); printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
}else{ }else{
printf("SICORTEX"); printf("SICORTEX");
} }
@ -149,6 +155,15 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{ }else{
printf("#define SICORTEX\n"); printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
@ -164,6 +179,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
if(detect()==CPU_LOONGSON3A) { if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n"); printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
}else{ }else{
#ifdef __mips64 #ifdef __mips64
printf("mips64\n"); printf("mips64\n");

View File

@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
range_M[0] = 0; range_M[0] = 0;
i = arg -> m; i = arg -> m;
} else { } else {
range_M[0] = range_M[0]; range_M[0] = range_m[0];
i = range_M[1] - range_M[0]; i = range_m[1] - range_m[0];
} }
num_cpu_m = 0; num_cpu_m = 0;

View File

@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg; queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m; queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].range_n = &range[num_cpu];
queue[num_cpu].sa = NULL; #if defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL; queue[num_cpu].sb = NULL;
#endif
queue[num_cpu].next = &queue[num_cpu + 1]; queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++; num_cpu ++;
} }
if (num_cpu) { if (num_cpu) {
#if defined(LOONGSON3A)
queue[0].sa = sa; queue[0].sa = sa;
queue[0].sb = sb; queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else
queue[0].sa = sa;
queue[0].sb = sb;
#endif
queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, exec_blas(num_cpu,

View File

@ -55,8 +55,8 @@ int CNAME(int mode,
range_M[0] = 0; range_M[0] = 0;
i = arg -> m; i = arg -> m;
} else { } else {
range_M[0] = range_M[0]; range_M[0] = range_m[0];
i = range_M[1] - range_M[0]; i = range_m[1] - range_m[0];
} }
num_cpu_m = 0; num_cpu_m = 0;

View File

@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
/* Initializing routine */ /* Initializing routine */
int blas_thread_init(void){ int blas_thread_init(void){
BLASLONG i; BLASLONG i;
int ret;
#ifdef NEED_STACKATTR #ifdef NEED_STACKATTR
pthread_attr_t attr; pthread_attr_t attr;
#endif #endif
@ -545,12 +546,16 @@ int blas_thread_init(void){
pthread_cond_init (&thread_status[i].wakeup, NULL); pthread_cond_init (&thread_status[i].wakeup, NULL);
#ifdef NEED_STACKATTR #ifdef NEED_STACKATTR
pthread_create(&blas_threads[i], &attr, ret=pthread_create(&blas_threads[i], &attr,
(void *)&blas_thread_server, (void *)i); (void *)&blas_thread_server, (void *)i);
#else #else
pthread_create(&blas_threads[i], NULL, ret=pthread_create(&blas_threads[i], NULL,
(void *)&blas_thread_server, (void *)i); (void *)&blas_thread_server, (void *)i);
#endif #endif
if(ret!=0){
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
exit(1);
}
} }
#ifdef MONITOR #ifdef MONITOR
@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads; blas_cpu_number = num_threads;
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
} }
void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads(int num_threads) {

View File

@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
} }
void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads(int num_threads) {

View File

@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){
#ifdef OS_LINUX #ifdef OS_LINUX
#ifdef DEBUG #ifdef DEBUG
int ret; int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){ if(ret==-1){
int errsv=errno; int errsv=errno;
perror("alloc_mmap:"); perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
} }
@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif #endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
blas_set_parameter(); blas_set_parameter();
#endif #endif

View File

@ -45,8 +45,22 @@ int get_L2_size(void);
#define DEFAULT_GEMM_P 128 #define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128 #define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128 #define DEFAULT_GEMM_R 128
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
/* Global Parameter */ /* Global Parameter */
#if GEMM_OFFSET_A == gemm_offset_a
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
#else
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
#endif
#if GEMM_OFFSET_B == gemm_offset_b
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
#else
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
#endif
#if SGEMM_P == sgemm_p #if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P; BLASLONG sgemm_p = DEFAULT_GEMM_P;
#else #else
@ -666,3 +680,36 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(ARCH_MIPS64)
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#ifdef SMP
if(blas_num_threads == 1){
#endif
//single thread
dgemm_r = 1024;
#ifdef SMP
}else{
//multi thread
dgemm_r = 200;
}
#endif
#endif
#if defined(LOONGSON3B)
#ifdef SMP
if(blas_num_threads == 1 || blas_num_threads == 2){
#endif
//single thread
dgemm_r = 640;
#ifdef SMP
}else{
//multi thread
dgemm_r = 160;
}
#endif
#endif
}
#endif

View File

@ -58,16 +58,16 @@ dll : ../$(LIBDLLNAME)
dll2 : libgoto2_shared.dll dll2 : libgoto2_shared.dll
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME) $(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1) ifeq ($(BINARY32), 1)
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def -lib /machine:i386 /def:libopenblas.def
else else
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def -lib /machine:X64 /def:libopenblas.def
endif endif
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
@ -75,7 +75,7 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
libgoto2.def : gensymbol libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
libgoto2_shared.def : gensymbol libgoto2_shared.def : gensymbol
@ -100,7 +100,7 @@ so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(EXTRALIB) -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest rm -f linktest

View File

@ -301,7 +301,7 @@
if ($ARGV[5] == 1) { if ($ARGV[5] == 1) {
#NO_LAPACK=1 #NO_LAPACK=1
@objs = (@blasobjs); @objs = (@blasobjs);
} elsif (-d "../lapack-3.1.1") { } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") {
@objs = (@blasobjs, @lapackobjs, @lapackobjs2); @objs = (@blasobjs, @lapackobjs, @lapackobjs2);
} else { } else {
@objs = (@blasobjs, @lapackobjs); @objs = (@blasobjs, @lapackobjs);
@ -389,6 +389,13 @@ if ($ARGV[0] eq "win2k"){
$count ++; $count ++;
} }
if ($ARGV[4] == 0) {
foreach $objs (@cblasobjs) {
print "\t",$objs,"=$objs"," \@", $count, "\n";
$count ++;
}
}
exit(0); exit(0);
} }

View File

@ -284,6 +284,10 @@ if ($link ne "") {
} }
if ($vendor eq "INTEL"){
$linker_a .= "-lgfortran"
}
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
open(CONFFILE, ">> $config" ) || die "Can't append $config"; open(CONFFILE, ">> $config" ) || die "Can't append $config";

View File

@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */ /* #define FORCE_CELL */
/* #define FORCE_SICORTEX */ /* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */ /* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */ /* #define FORCE_GENERIC */
/* #define FORCE_SPARC */ /* #define FORCE_SPARC */
@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#endif #endif
#ifdef FORCE_LOONGSON3B
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DLOONGSON3B " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3b"
#define CORENAME "LOONGSON3B"
#else
#endif
#ifdef FORCE_ITANIUM2 #ifdef FORCE_ITANIUM2
#define FORCE #define FORCE
#define ARCHITECTURE "IA64" #define ARCHITECTURE "IA64"

View File

@ -34,6 +34,7 @@ int main(int argc, char **argv) {
#ifdef USE64BITINT #ifdef USE64BITINT
printf("#define USE64BITINT\n"); printf("#define USE64BITINT\n");
#endif #endif
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
} }
return 0; return 0;

View File

@ -770,20 +770,36 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c
xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
ifndef USE_NETLIB_GEMV
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
else
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f
$(FC) -c $(FFLAGS) -o $(@F) $<
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f
$(FC) -c $(FFLAGS) -o $(@F) $<
endif
qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
ifndef USE_NETLIB_GEMV
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
else
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f
$(FC) -c $(FFLAGS) -o $(@F) $<
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f
$(FC) -c $(FFLAGS) -o $(@F) $<
endif
xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<

View File

@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);
args.common = NULL; args.common = NULL;
args.nthreads = num_cpu_avail(3);
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
args.nthreads = 1;
}else{
args.nthreads = num_cpu_avail(3);
}
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

285
interface/netlib/cgemv.f Normal file
View File

@ -0,0 +1,285 @@
*  CGEMV -- single precision complex general matrix-vector
*  multiply-add.  Performs one of
*
*     y := alpha*A*x    + beta*y      TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y      TRANS = 'T' or 't'
*     y := alpha*A**H*x + beta*y      TRANS = 'C' or 'c'
*
*  where alpha and beta are scalars, x and y are vectors and A is an
*  M by N matrix.
*
*  Arguments
*  ---------
*  TRANS  (in)   CHARACTER*1.  Selects the operation, see above.
*  M      (in)   INTEGER.  Number of rows of A, at least zero.
*  N      (in)   INTEGER.  Number of columns of A, at least zero.
*  ALPHA  (in)   COMPLEX.  The scalar alpha.
*  A      (in)   COMPLEX array, dimension (LDA,N).  The leading M by N
*                part holds the matrix of coefficients.
*  LDA    (in)   INTEGER.  First dimension of A as declared by the
*                caller, at least max(1,M).
*  X      (in)   COMPLEX array of at least 1+(N-1)*abs(INCX) elements
*                when TRANS = 'N' or 'n', and at least
*                1+(M-1)*abs(INCX) elements otherwise.
*  INCX   (in)   INTEGER.  Increment between elements of X, nonzero.
*  BETA   (in)   COMPLEX.  The scalar beta.  When it is zero Y need
*                not be set on input.
*  Y      (i/o)  COMPLEX array of at least 1+(M-1)*abs(INCY) elements
*                when TRANS = 'N' or 'n', and at least
*                1+(N-1)*abs(INCY) elements otherwise.  Overwritten
*                by the updated vector y.
*  INCY   (in)   INTEGER.  Increment between elements of Y, nonzero.
*
*  All arguments except Y are unchanged on exit.  An invalid argument
*  is reported through XERBLA with the position (1, 2, 3, 6, 8 or 11)
*  of the first offending argument, and Y is left untouched.  The
*  vector and matrix arguments are not referenced when M = 0 or N = 0.
*
*  Level 2 Blas routine, written on 22-October-1986 by
*     Jack Dongarra, Argonne National Lab.
*     Jeremy Du Croz, Nag Central Office.
*     Sven Hammarling, Nag Central Office.
*     Richard Hanson, Sandia National Labs.
*
      SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar arguments ..
      COMPLEX ALPHA,BETA
      INTEGER INCX,INCY,LDA,M,N
      CHARACTER TRANS
*     .. Array arguments ..
      COMPLEX A(LDA,*),X(*),Y(*)
*     .. Constants ..
      COMPLEX ONE
      PARAMETER (ONE= (1.0E+0,0.0E+0))
      COMPLEX ZERO
      PARAMETER (ZERO= (0.0E+0,0.0E+0))
*     .. Local variables ..
*     AXJ holds alpha*x(j); ACC accumulates one dot product.
      COMPLEX ACC,AXJ
      INTEGER COL,IERR,K,NX,NY,ROW,XBEG,XPOS,YBEG,YPOS
      LOGICAL NOTRAN,PLAIN
*     .. External routines and intrinsics ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC CONJG,MAX
*
*     NOTRAN selects y := alpha*A*x + beta*y; PLAIN selects the
*     unconjugated transpose.
*
      NOTRAN = LSAME(TRANS,'N')
      PLAIN = LSAME(TRANS,'T')
*
*     Validate the arguments.  The tests run from the last argument
*     to the first so that the lowest-numbered offender is reported.
*
      IERR = 0
      IF (INCY.EQ.0) IERR = 11
      IF (INCX.EQ.0) IERR = 8
      IF (LDA.LT.MAX(1,M)) IERR = 6
      IF (N.LT.0) IERR = 3
      IF (M.LT.0) IERR = 2
      IF (.NOT.NOTRAN .AND. .NOT.PLAIN .AND.
     +    .NOT.LSAME(TRANS,'C')) IERR = 1
      IF (IERR.NE.0) THEN
          CALL XERBLA('CGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the logical lengths of x and y.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
*
*     A negative increment walks the vector backwards, so the first
*     logical element then sits at the far end of the storage.
*
      XBEG = 1
      IF (INCX.LT.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LT.0) YBEG = 1 - (NY-1)*INCY
*
*     Form y := beta*y.  A zero beta stores zeros instead of
*     multiplying, so Y is never read in that case.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 K = 1,NY
                  Y(K) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 K = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 K = 1,NY
                  Y(K) = BETA*Y(K)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 K = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y, one column of A at a time.  Columns
*        whose x element is zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + AXJ*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + AXJ*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        y := alpha*A**T*x + y  or  y := alpha*A**H*x + y.  Each
*        element of y receives alpha times a dot product of one
*        column of A (conjugated unless PLAIN) with x.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 110 COL = 1,N
                  ACC = ZERO
                  IF (PLAIN) THEN
                      DO 90 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(ROW)
   90                 CONTINUE
                  ELSE
                      DO 100 ROW = 1,M
                          ACC = ACC + CONJG(A(ROW,COL))*X(ROW)
  100                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  110         CONTINUE
          ELSE
              DO 140 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  IF (PLAIN) THEN
                      DO 120 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(XPOS)
                          XPOS = XPOS + INCX
  120                 CONTINUE
                  ELSE
                      DO 130 ROW = 1,M
                          ACC = ACC + CONJG(A(ROW,COL))*X(XPOS)
                          XPOS = XPOS + INCX
  130                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  140         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of CGEMV .
*
      END

265
interface/netlib/dgemv.f Normal file
View File

@ -0,0 +1,265 @@
*  DGEMV -- double precision general matrix-vector multiply-add.
*  Performs one of
*
*     y := alpha*A*x    + beta*y      TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y      TRANS = 'T', 't', 'C' or 'c'
*
*  where alpha and beta are scalars, x and y are vectors and A is an
*  M by N matrix.
*
*  Arguments
*  ---------
*  TRANS  (in)   CHARACTER*1.  Selects the operation, see above.
*  M      (in)   INTEGER.  Number of rows of A, at least zero.
*  N      (in)   INTEGER.  Number of columns of A, at least zero.
*  ALPHA  (in)   DOUBLE PRECISION.  The scalar alpha.
*  A      (in)   DOUBLE PRECISION array, dimension (LDA,N).  The
*                leading M by N part holds the matrix of coefficients.
*  LDA    (in)   INTEGER.  First dimension of A as declared by the
*                caller, at least max(1,M).
*  X      (in)   DOUBLE PRECISION array of at least
*                1+(N-1)*abs(INCX) elements when TRANS = 'N' or 'n',
*                and at least 1+(M-1)*abs(INCX) elements otherwise.
*  INCX   (in)   INTEGER.  Increment between elements of X, nonzero.
*  BETA   (in)   DOUBLE PRECISION.  The scalar beta.  When it is zero
*                Y need not be set on input.
*  Y      (i/o)  DOUBLE PRECISION array of at least
*                1+(M-1)*abs(INCY) elements when TRANS = 'N' or 'n',
*                and at least 1+(N-1)*abs(INCY) elements otherwise.
*                Overwritten by the updated vector y.
*  INCY   (in)   INTEGER.  Increment between elements of Y, nonzero.
*
*  All arguments except Y are unchanged on exit.  An invalid argument
*  is reported through XERBLA with the position (1, 2, 3, 6, 8 or 11)
*  of the first offending argument, and Y is left untouched.  The
*  vector and matrix arguments are not referenced when M = 0 or N = 0.
*
*  Level 2 Blas routine, written on 22-October-1986 by
*     Jack Dongarra, Argonne National Lab.
*     Jeremy Du Croz, Nag Central Office.
*     Sven Hammarling, Nag Central Office.
*     Richard Hanson, Sandia National Labs.
*
      SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar arguments ..
      DOUBLE PRECISION ALPHA,BETA
      INTEGER INCX,INCY,LDA,M,N
      CHARACTER TRANS
*     .. Array arguments ..
      DOUBLE PRECISION A(LDA,*),X(*),Y(*)
*     .. Constants ..
      DOUBLE PRECISION ONE,ZERO
      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
*     .. Local variables ..
*     AXJ holds alpha*x(j); ACC accumulates one dot product.
      DOUBLE PRECISION ACC,AXJ
      INTEGER COL,IERR,K,NX,NY,ROW,XBEG,XPOS,YBEG,YPOS
      LOGICAL NOTRAN
*     .. External routines and intrinsics ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC MAX
*
*     NOTRAN selects y := alpha*A*x + beta*y.
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Validate the arguments.  The tests run from the last argument
*     to the first so that the lowest-numbered offender is reported.
*
      IERR = 0
      IF (INCY.EQ.0) IERR = 11
      IF (INCX.EQ.0) IERR = 8
      IF (LDA.LT.MAX(1,M)) IERR = 6
      IF (N.LT.0) IERR = 3
      IF (M.LT.0) IERR = 2
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) IERR = 1
      IF (IERR.NE.0) THEN
          CALL XERBLA('DGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the logical lengths of x and y.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
*
*     A negative increment walks the vector backwards, so the first
*     logical element then sits at the far end of the storage.
*
      XBEG = 1
      IF (INCX.LT.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LT.0) YBEG = 1 - (NY-1)*INCY
*
*     Form y := beta*y.  A zero beta stores zeros instead of
*     multiplying, so Y is never read in that case.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 K = 1,NY
                  Y(K) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 K = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 K = 1,NY
                  Y(K) = BETA*Y(K)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 K = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y, one column of A at a time.  Columns
*        whose x element is zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + AXJ*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + AXJ*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        y := alpha*A**T*x + y.  Each element of y receives alpha
*        times the dot product of one column of A with x.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 100 COL = 1,N
                  ACC = ZERO
                  DO 90 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(ROW)
   90             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  100         CONTINUE
          ELSE
              DO 120 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  DO 110 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(XPOS)
                      XPOS = XPOS + INCX
  110             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  120         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of DGEMV .
*
      END

265
interface/netlib/sgemv.f Normal file
View File

@ -0,0 +1,265 @@
*  SGEMV -- single precision general matrix-vector multiply-add.
*  Performs one of
*
*     y := alpha*A*x    + beta*y      TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y      TRANS = 'T', 't', 'C' or 'c'
*
*  where alpha and beta are scalars, x and y are vectors and A is an
*  M by N matrix.
*
*  Arguments
*  ---------
*  TRANS  (in)   CHARACTER*1.  Selects the operation, see above.
*  M      (in)   INTEGER.  Number of rows of A, at least zero.
*  N      (in)   INTEGER.  Number of columns of A, at least zero.
*  ALPHA  (in)   REAL.  The scalar alpha.
*  A      (in)   REAL array, dimension (LDA,N).  The leading M by N
*                part holds the matrix of coefficients.
*  LDA    (in)   INTEGER.  First dimension of A as declared by the
*                caller, at least max(1,M).
*  X      (in)   REAL array of at least 1+(N-1)*abs(INCX) elements
*                when TRANS = 'N' or 'n', and at least
*                1+(M-1)*abs(INCX) elements otherwise.
*  INCX   (in)   INTEGER.  Increment between elements of X, nonzero.
*  BETA   (in)   REAL.  The scalar beta.  When it is zero Y need not
*                be set on input.
*  Y      (i/o)  REAL array of at least 1+(M-1)*abs(INCY) elements
*                when TRANS = 'N' or 'n', and at least
*                1+(N-1)*abs(INCY) elements otherwise.  Overwritten
*                by the updated vector y.
*  INCY   (in)   INTEGER.  Increment between elements of Y, nonzero.
*
*  All arguments except Y are unchanged on exit.  An invalid argument
*  is reported through XERBLA with the position (1, 2, 3, 6, 8 or 11)
*  of the first offending argument, and Y is left untouched.  The
*  vector and matrix arguments are not referenced when M = 0 or N = 0.
*
*  Level 2 Blas routine, written on 22-October-1986 by
*     Jack Dongarra, Argonne National Lab.
*     Jeremy Du Croz, Nag Central Office.
*     Sven Hammarling, Nag Central Office.
*     Richard Hanson, Sandia National Labs.
*
      SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar arguments ..
      REAL ALPHA,BETA
      INTEGER INCX,INCY,LDA,M,N
      CHARACTER TRANS
*     .. Array arguments ..
      REAL A(LDA,*),X(*),Y(*)
*     .. Constants ..
      REAL ONE,ZERO
      PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
*     .. Local variables ..
*     AXJ holds alpha*x(j); ACC accumulates one dot product.
      REAL ACC,AXJ
      INTEGER COL,IERR,K,NX,NY,ROW,XBEG,XPOS,YBEG,YPOS
      LOGICAL NOTRAN
*     .. External routines and intrinsics ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC MAX
*
*     NOTRAN selects y := alpha*A*x + beta*y.
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Validate the arguments.  The tests run from the last argument
*     to the first so that the lowest-numbered offender is reported.
*
      IERR = 0
      IF (INCY.EQ.0) IERR = 11
      IF (INCX.EQ.0) IERR = 8
      IF (LDA.LT.MAX(1,M)) IERR = 6
      IF (N.LT.0) IERR = 3
      IF (M.LT.0) IERR = 2
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) IERR = 1
      IF (IERR.NE.0) THEN
          CALL XERBLA('SGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the logical lengths of x and y.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
*
*     A negative increment walks the vector backwards, so the first
*     logical element then sits at the far end of the storage.
*
      XBEG = 1
      IF (INCX.LT.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LT.0) YBEG = 1 - (NY-1)*INCY
*
*     Form y := beta*y.  A zero beta stores zeros instead of
*     multiplying, so Y is never read in that case.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 K = 1,NY
                  Y(K) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 K = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 K = 1,NY
                  Y(K) = BETA*Y(K)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 K = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y, one column of A at a time.  Columns
*        whose x element is zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + AXJ*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + AXJ*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        y := alpha*A**T*x + y.  Each element of y receives alpha
*        times the dot product of one column of A with x.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 100 COL = 1,N
                  ACC = ZERO
                  DO 90 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(ROW)
   90             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  100         CONTINUE
          ELSE
              DO 120 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  DO 110 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(XPOS)
                      XPOS = XPOS + INCX
  110             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  120         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of SGEMV .
*
      END

285
interface/netlib/zgemv.f Normal file
View File

@ -0,0 +1,285 @@
*  ZGEMV -- double precision complex general matrix-vector
*  multiply-add.  Performs one of
*
*     y := alpha*A*x    + beta*y      TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y      TRANS = 'T' or 't'
*     y := alpha*A**H*x + beta*y      TRANS = 'C' or 'c'
*
*  where alpha and beta are scalars, x and y are vectors and A is an
*  M by N matrix.
*
*  Arguments
*  ---------
*  TRANS  (in)   CHARACTER*1.  Selects the operation, see above.
*  M      (in)   INTEGER.  Number of rows of A, at least zero.
*  N      (in)   INTEGER.  Number of columns of A, at least zero.
*  ALPHA  (in)   COMPLEX*16.  The scalar alpha.
*  A      (in)   COMPLEX*16 array, dimension (LDA,N).  The leading
*                M by N part holds the matrix of coefficients.
*  LDA    (in)   INTEGER.  First dimension of A as declared by the
*                caller, at least max(1,M).
*  X      (in)   COMPLEX*16 array of at least 1+(N-1)*abs(INCX)
*                elements when TRANS = 'N' or 'n', and at least
*                1+(M-1)*abs(INCX) elements otherwise.
*  INCX   (in)   INTEGER.  Increment between elements of X, nonzero.
*  BETA   (in)   COMPLEX*16.  The scalar beta.  When it is zero Y
*                need not be set on input.
*  Y      (i/o)  COMPLEX*16 array of at least 1+(M-1)*abs(INCY)
*                elements when TRANS = 'N' or 'n', and at least
*                1+(N-1)*abs(INCY) elements otherwise.  Overwritten
*                by the updated vector y.
*  INCY   (in)   INTEGER.  Increment between elements of Y, nonzero.
*
*  All arguments except Y are unchanged on exit.  An invalid argument
*  is reported through XERBLA with the position (1, 2, 3, 6, 8 or 11)
*  of the first offending argument, and Y is left untouched.  The
*  vector and matrix arguments are not referenced when M = 0 or N = 0.
*
*  Level 2 Blas routine, written on 22-October-1986 by
*     Jack Dongarra, Argonne National Lab.
*     Jeremy Du Croz, Nag Central Office.
*     Sven Hammarling, Nag Central Office.
*     Richard Hanson, Sandia National Labs.
*
      SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar arguments ..
      DOUBLE COMPLEX ALPHA,BETA
      INTEGER INCX,INCY,LDA,M,N
      CHARACTER TRANS
*     .. Array arguments ..
      DOUBLE COMPLEX A(LDA,*),X(*),Y(*)
*     .. Constants ..
      DOUBLE COMPLEX ONE
      PARAMETER (ONE= (1.0D+0,0.0D+0))
      DOUBLE COMPLEX ZERO
      PARAMETER (ZERO= (0.0D+0,0.0D+0))
*     .. Local variables ..
*     AXJ holds alpha*x(j); ACC accumulates one dot product.
      DOUBLE COMPLEX ACC,AXJ
      INTEGER COL,IERR,K,NX,NY,ROW,XBEG,XPOS,YBEG,YPOS
      LOGICAL NOTRAN,PLAIN
*     .. External routines and intrinsics ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC DCONJG,MAX
*
*     NOTRAN selects y := alpha*A*x + beta*y; PLAIN selects the
*     unconjugated transpose.
*
      NOTRAN = LSAME(TRANS,'N')
      PLAIN = LSAME(TRANS,'T')
*
*     Validate the arguments.  The tests run from the last argument
*     to the first so that the lowest-numbered offender is reported.
*
      IERR = 0
      IF (INCY.EQ.0) IERR = 11
      IF (INCX.EQ.0) IERR = 8
      IF (LDA.LT.MAX(1,M)) IERR = 6
      IF (N.LT.0) IERR = 3
      IF (M.LT.0) IERR = 2
      IF (.NOT.NOTRAN .AND. .NOT.PLAIN .AND.
     +    .NOT.LSAME(TRANS,'C')) IERR = 1
      IF (IERR.NE.0) THEN
          CALL XERBLA('ZGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the logical lengths of x and y.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
*
*     A negative increment walks the vector backwards, so the first
*     logical element then sits at the far end of the storage.
*
      XBEG = 1
      IF (INCX.LT.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LT.0) YBEG = 1 - (NY-1)*INCY
*
*     Form y := beta*y.  A zero beta stores zeros instead of
*     multiplying, so Y is never read in that case.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 K = 1,NY
                  Y(K) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 K = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 K = 1,NY
                  Y(K) = BETA*Y(K)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 K = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y, one column of A at a time.  Columns
*        whose x element is zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + AXJ*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      AXJ = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + AXJ*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        y := alpha*A**T*x + y  or  y := alpha*A**H*x + y.  Each
*        element of y receives alpha times a dot product of one
*        column of A (conjugated unless PLAIN) with x.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 110 COL = 1,N
                  ACC = ZERO
                  IF (PLAIN) THEN
                      DO 90 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(ROW)
   90                 CONTINUE
                  ELSE
                      DO 100 ROW = 1,M
                          ACC = ACC + DCONJG(A(ROW,COL))*X(ROW)
  100                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  110         CONTINUE
          ELSE
              DO 140 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  IF (PLAIN) THEN
                      DO 120 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(XPOS)
                          XPOS = XPOS + INCX
  120                 CONTINUE
                  ELSE
                      DO 130 ROW = 1,M
                          ACC = ACC + DCONJG(A(ROW,COL))*X(XPOS)
                          XPOS = XPOS + INCX
  130                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  140         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of ZGEMV .
*
      END

View File

@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
FLOAT *sa, *sb; FLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL; int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
#else #else
int mode = BLAS_SINGLE | BLAS_REAL; int mode = BLAS_SINGLE | BLAS_REAL;
#endif #endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif #endif
#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)
@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FLOAT *sa, *sb; FLOAT *sa, *sb;
#ifdef SMP #ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL; int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE) #elif defined(DOUBLE)
@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#else #else
int mode = BLAS_SINGLE | BLAS_REAL; int mode = BLAS_SINGLE | BLAS_REAL;
#endif #endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif #endif
#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)

View File

@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
# TRMM kernel objects for TARGET = LOONGSON3B.
#
# In this branch the s/d/c/z TRMM kernels are compiled from their own
# sources ($(STRMMKERNEL), $(DTRMMKERNEL), $(CTRMMKERNEL), $(ZTRMMKERNEL));
# the q (XDOUBLE) variants are still compiled from $(QGEMMKERNEL).
# Every object is compiled with -DTRMMKERNEL.  The two-letter tag in the
# object name mirrors the macros given on the command line:
#   first letter    L -> -DLEFT      R -> -ULEFT
#   second letter   N -> -UTRANSA    T -> -DTRANSA
#   complex only    R -> -UTRANSA -DCONJ    C -> -DTRANSA -DCONJ
# Complex objects additionally get -DNN (N/T tags), -DCN (LR/LC) or
# -DNC (RR/RC).
ifeq ($(TARGET), LOONGSON3B)
# Single precision real: -UDOUBLE -UCOMPLEX.
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
# Double precision real: -DDOUBLE -UCOMPLEX.
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
# Extended precision real: -DXDOUBLE -UCOMPLEX, built from the GEMM kernel
# source $(QGEMMKERNEL).
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
# Single precision complex: -UDOUBLE -DCOMPLEX.
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
# Double precision complex: -DDOUBLE -DCOMPLEX.
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

View File

@ -0,0 +1,157 @@
#include "common.h"
/*
 * Generic 2x2 register-blocked GEMM micro-kernel: C += alpha * A * B.
 *
 *   bm, bn  number of rows / columns of the C tile to update
 *   bk      length of the inner (summation) dimension
 *   alpha   scalar applied to every accumulated product before it is
 *           added to C
 *   ba      packed A: for each pair of rows, bk groups of two values
 *           (upper row, lower row); when bm is odd the last row follows
 *           as bk single values
 *   bb      packed B: for each pair of columns, bk groups of two values
 *           (left column, right column); when bn is odd the last column
 *           follows as bk single values
 *   C       output; consecutive rows are adjacent, consecutive columns
 *           are ldc elements apart; updated in place
 *   ldc     distance in elements between columns of C
 *   offset  only present in TRMMKERNEL builds; not used by this kernel
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
		,BLASLONG offset
#endif
		)
{
	BLASLONG rows, cols, depth;
	FLOAT *col0, *col1;	/* the two C columns currently being updated */
	FLOAT *pa, *pb;		/* read cursors into the packed A and B panels */
	FLOAT c00, c10, c01, c11;	/* cRC: accumulator for row R, column C of the tile */

	/* Pairs of columns. */
	for (cols = bn / 2; cols > 0; cols--) {
		col0 = C;
		col1 = C + ldc;
		pa = ba;	/* every column pair walks all of packed A again */

		/* 2x2 tiles; the inner dimension is unrolled four times. */
		for (rows = bm / 2; rows > 0; rows--) {
			pb = bb;
			c00 = 0;
			c10 = 0;
			c01 = 0;
			c11 = 0;

			for (depth = bk / 4; depth > 0; depth--) {
				c00 += pa[0] * pb[0];
				c10 += pa[1] * pb[0];
				c01 += pa[0] * pb[1];
				c11 += pa[1] * pb[1];

				c00 += pa[2] * pb[2];
				c10 += pa[3] * pb[2];
				c01 += pa[2] * pb[3];
				c11 += pa[3] * pb[3];

				c00 += pa[4] * pb[4];
				c10 += pa[5] * pb[4];
				c01 += pa[4] * pb[5];
				c11 += pa[5] * pb[5];

				c00 += pa[6] * pb[6];
				c10 += pa[7] * pb[6];
				c01 += pa[6] * pb[7];
				c11 += pa[7] * pb[7];

				pa += 8;
				pb += 8;
			}

			/* Leftover 0..3 steps of the inner dimension. */
			for (depth = bk & 3; depth > 0; depth--) {
				c00 += pa[0] * pb[0];
				c10 += pa[1] * pb[0];
				c01 += pa[0] * pb[1];
				c11 += pa[1] * pb[1];
				pa += 2;
				pb += 2;
			}

			/* Scale and accumulate as separate statements, one element at a time. */
			c00 *= alpha;
			col0[0] += c00;
			c10 *= alpha;
			col0[1] += c10;
			c01 *= alpha;
			col1[0] += c01;
			c11 *= alpha;
			col1[1] += c11;

			col0 += 2;
			col1 += 2;
		}

		/* Odd last row: a 1x2 tile; pa continues where the 2-row panels ended. */
		if (bm & 1) {
			pb = bb;
			c00 = 0;
			c01 = 0;

			for (depth = bk; depth > 0; depth--) {
				c00 += pa[0] * pb[0];
				c01 += pa[0] * pb[1];
				pa += 1;
				pb += 2;
			}

			c00 *= alpha;
			col0[0] += c00;
			c01 *= alpha;
			col1[0] += c01;
		}

		/* Step to the next column pair of packed B and of C. */
		bb += 2 * bk;
		C += 2 * ldc;
	}

	/* Odd last column. */
	if (bn & 1) {
		col0 = C;
		pa = ba;

		/* 2x1 tiles. */
		for (rows = bm / 2; rows > 0; rows--) {
			pb = bb;
			c00 = 0;
			c10 = 0;

			for (depth = bk; depth > 0; depth--) {
				c00 += pa[0] * pb[0];
				c10 += pa[1] * pb[0];
				pa += 2;
				pb += 1;
			}

			c00 *= alpha;
			col0[0] += c00;
			c10 *= alpha;
			col0[1] += c10;

			col0 += 2;
		}

		/* Bottom-right 1x1 corner. */
		if (bm & 1) {
			pb = bb;
			c00 = 0;

			for (depth = bk; depth > 0; depth--) {
				c00 += pa[0] * pb[0];
				pa += 1;
				pb += 1;
			}

			c00 *= alpha;
			col0[0] += c00;
		}
	}

	return 0;
}

View File

@ -0,0 +1,280 @@
#include "common.h"
/*
 * Plain-C triangular-multiply micro-kernel, 2x2 blocking, real FLOAT.
 *
 * C is processed in panels of two columns (plus one single-column panel when
 * bn is odd); inside a panel the kernel walks tiles of two rows (plus one
 * single-row tile when bm is odd).  For every tile it sums products of ba
 * and bb values over `depth` consecutive k-steps, multiplies the sums by
 * alpha and STORES them into C -- the previous contents of C are never read.
 *
 * Per k-step a two-row tile consumes 2 values of ba (a one-row tile 1), and
 * a two-column panel consumes 2 values of bb (a one-column panel 1).  Every
 * tile advances the ba cursor by bk k-steps in total, whatever part of the
 * k range it actually multiplies.
 * NOTE(review): this looks like the packed operand layout produced by the
 * library's copy routines -- confirm against the caller.
 *
 * `off` (in k-steps) selects the part of the k range a tile uses:
 *   TRMM22_HEAD == 1 : k-steps [0, off + extent), the rest is skipped after;
 *   TRMM22_HEAD == 0 : the first `off` k-steps are skipped, then [off, bk).
 * With LEFT defined `off` grows tile by tile (by the tile's row count) and is
 * reset to `offset` for each panel; without LEFT it starts at -offset and
 * grows panel by panel (by the panel's column count).
 *
 * NOTE(review): `off` is only initialised when TRMMKERNEL is defined, yet it
 * is read unconditionally; presumably this file is never built without
 * TRMMKERNEL -- verify in the build rules.
 *
 * Always returns 0.
 */

/* 1 when LEFT and TRANSA are both defined or both undefined. */
#if defined(LEFT) == defined(TRANSA)
#define TRMM22_HEAD 1
#else
#define TRMM22_HEAD 0
#endif

/* Number of k-steps added to `off` in HEAD mode: the tile's row count when
   LEFT is defined, otherwise the panel's column count. */
#ifdef LEFT
#define TRMM22_EXTENT(rows, cols) (rows)
#else
#define TRMM22_EXTENT(rows, cols) (cols)
#endif

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
    BLASLONG panel, tile, k;
    BLASLONG off, depth;          /* depth is reused for the skip count after a tile */
    FLOAT *c0, *c1;               /* write cursors into the two columns of the panel  */
    FLOAT *pa, *pb;               /* read cursors into ba / bb                        */
    FLOAT s00, s10, s01, s11;     /* sXY: sum for row X, column Y of the tile         */

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---------------- panels of two columns ---------------- */
    for (panel = 0; panel < bn/2; panel++) {
        c0 = C;
        c1 = c0 + ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        pa = ba;

        /* 2x2 tiles */
        for (tile = 0; tile < bm/2; tile++) {
#if TRMM22_HEAD
            pb = bb;
            depth = off + TRMM22_EXTENT(2, 2);
#else
            pa += off*2;
            pb = bb + off*2;
            depth = bk - off;
#endif
            s00 = 0;  s10 = 0;  s01 = 0;  s11 = 0;

            /* four k-steps per trip; the depth/4 + depth&3 split is kept
               exactly as is, since it decides the trip counts */
            for (k = 0; k < depth/4; k++) {
                s00 += pa[0]*pb[0];  s10 += pa[1]*pb[0];  s01 += pa[0]*pb[1];  s11 += pa[1]*pb[1];
                s00 += pa[2]*pb[2];  s10 += pa[3]*pb[2];  s01 += pa[2]*pb[3];  s11 += pa[3]*pb[3];
                s00 += pa[4]*pb[4];  s10 += pa[5]*pb[4];  s01 += pa[4]*pb[5];  s11 += pa[5]*pb[5];
                s00 += pa[6]*pb[6];  s10 += pa[7]*pb[6];  s01 += pa[6]*pb[7];  s11 += pa[7]*pb[7];
                pa += 8;
                pb += 8;
            }
            /* leftover k-steps, one at a time */
            for (k = 0; k < (depth&3); k++) {
                s00 += pa[0]*pb[0];  s10 += pa[1]*pb[0];  s01 += pa[0]*pb[1];  s11 += pa[1]*pb[1];
                pa += 2;
                pb += 2;
            }

            /* overwrite, do not accumulate */
            c0[0] = s00*alpha;
            c0[1] = s10*alpha;
            c1[0] = s01*alpha;
            c1[1] = s11*alpha;

#if TRMM22_HEAD
            /* step over the k-steps this tile did not use */
            depth = bk - off - TRMM22_EXTENT(2, 2);
            pa += depth*2;
            pb += depth*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            c0 += 2;
            c1 += 2;
        }

        /* leftover 1x2 tile when bm is odd */
        for (tile = 0; tile < (bm&1); tile++) {
#if TRMM22_HEAD
            pb = bb;
            depth = off + TRMM22_EXTENT(1, 2);
#else
            pa += off;
            pb = bb + off*2;
            depth = bk - off;
#endif
            s00 = 0;  s01 = 0;

            for (k = 0; k < depth; k++) {
                s00 += pa[0]*pb[0];
                s01 += pa[0]*pb[1];
                pa += 1;
                pb += 2;
            }

            c0[0] = s00*alpha;
            c1[0] = s01*alpha;

#if TRMM22_HEAD
            depth = bk - off - TRMM22_EXTENT(1, 2);
            pa += depth;
            pb += depth*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            c0 += 1;
            c1 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        bb += 2*bk;     /* next two-column slice of bb */
        C  += 2*ldc;    /* next two columns of C       */
    }

    /* ---------------- leftover single column when bn is odd ---------------- */
    for (panel = 0; panel < (bn&1); panel++) {
        c0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        pa = ba;

        /* 2x1 tiles */
        for (tile = 0; tile < bm/2; tile++) {
#if TRMM22_HEAD
            pb = bb;
            depth = off + TRMM22_EXTENT(2, 1);
#else
            pa += off*2;
            pb = bb + off;
            depth = bk - off;
#endif
            s00 = 0;  s10 = 0;

            for (k = 0; k < depth; k++) {
                s00 += pa[0]*pb[0];
                s10 += pa[1]*pb[0];
                pa += 2;
                pb += 1;
            }

            c0[0] = s00*alpha;
            c0[1] = s10*alpha;

#if TRMM22_HEAD
            depth = bk - off - TRMM22_EXTENT(2, 1);
            pa += depth*2;
            pb += depth;
#endif
#ifdef LEFT
            off += 2;
#endif
            c0 += 2;
        }

        /* leftover 1x1 tile when bm is odd */
        for (tile = 0; tile < (bm&1); tile++) {
#if TRMM22_HEAD
            pb = bb;
            depth = off + TRMM22_EXTENT(1, 1);
#else
            pa += off;
            pb = bb + off;
            depth = bk - off;
#endif
            s00 = 0;

            for (k = 0; k < depth; k++) {
                s00 += pa[0]*pb[0];
                pa += 1;
                pb += 1;
            }

            c0[0] = s00*alpha;

#if TRMM22_HEAD
            depth = bk - off - TRMM22_EXTENT(1, 1);
            pa += depth;
            pb += depth;
#endif
#ifdef LEFT
            off += 1;
#endif
            c0 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 1;
#endif
        bb += bk;
        C  += ldc;
    }
    return 0;
}

#undef TRMM22_EXTENT
#undef TRMM22_HEAD

View File

@ -0,0 +1,838 @@
#include "common.h"
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
/*
 * Plain-C complex GEMM micro-kernel, 2x2 blocking.
 *
 * Complex numbers are stored as (real, imaginary) pairs of FLOAT.  C is
 * processed in panels of two complex columns (plus one single-column panel
 * when bn is odd); inside a panel the kernel walks tiles of two complex rows
 * (plus one single-row tile when bm is odd).  For every tile it accumulates
 * products of ba and bb values over all bk k-steps and then ADDS
 * (alphar + i*alphai) times the accumulated sum to C.
 *
 * Per k-step a two-row tile consumes 4 FLOATs of ba (a one-row tile 2) and a
 * two-column panel consumes 4 FLOATs of bb (a one-column panel 2).
 * NOTE(review): this looks like the packed operand layout produced by the
 * library's copy routines -- confirm against the caller.
 *
 * Which conjugations are applied is chosen at compile time by the variant
 * macros (NN, NT, ... CC), grouped below into four sign patterns.  Each
 * group is tested independently, exactly as in the hand-expanded form this
 * replaces; if no group is selected nothing is accumulated.
 *
 * `offset` (present only with TRMMKERNEL) is not used.  Always returns 0.
 */

/* The four variant groups, each 1 when any of its variants is defined. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ZG22_PLAIN 1            /* a * b             */
#else
#define ZG22_PLAIN 0
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ZG22_CONJ_B 1           /* a * conj(b)       */
#else
#define ZG22_CONJ_B 0
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ZG22_CONJ_A 1           /* conj(a) * b       */
#else
#define ZG22_CONJ_A 0
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define ZG22_CONJ_AB 1          /* conj(a) * conj(b) */
#else
#define ZG22_CONJ_AB 0
#endif

/*
 * One complex multiply-accumulate into (re, im) from a = (ar, ai) and
 * b = (br, bi).  S1, S2, S3 are the signs (+ or -) of the ai*br, ai*bi and
 * ar*bi terms.  The four terms are added one statement at a time, in this
 * order, so each accumulator sees the same sequence of roundings as in the
 * fully written-out form.
 */
#define ZG22_CMAC(S1, S2, S3, re, im, ar, ai, br, bi)  \
    do {                                               \
        re = re +  (ar)*(br);                          \
        im = im S1 (ai)*(br);                          \
        re = re S2 (ai)*(bi);                          \
        im = im S3 (ar)*(bi);                          \
    } while (0)

/*
 * k-step `s` (relative to the cursors pa / pb) of a full 2x2 tile.
 * Uses the locals pa, pb and the eight accumulators of CNAME.
 */
#define ZG22_STEP_2X2(S1, S2, S3, s)                                                          \
    do {                                                                                      \
        ZG22_CMAC(S1, S2, S3, r00, i00, pa[4*(s)+0], pa[4*(s)+1], pb[4*(s)+0], pb[4*(s)+1]);  \
        ZG22_CMAC(S1, S2, S3, r10, i10, pa[4*(s)+2], pa[4*(s)+3], pb[4*(s)+0], pb[4*(s)+1]);  \
        ZG22_CMAC(S1, S2, S3, r01, i01, pa[4*(s)+0], pa[4*(s)+1], pb[4*(s)+2], pb[4*(s)+3]);  \
        ZG22_CMAC(S1, S2, S3, r11, i11, pa[4*(s)+2], pa[4*(s)+3], pb[4*(s)+2], pb[4*(s)+3]);  \
    } while (0)

/*
 * c[0..1] += (alphar + i*alphai) * (re + i*im).  Each product goes through
 * a FLOAT temporary before it is added, and the alphar contribution is
 * applied to both parts before the alphai contribution.
 */
#define ZG22_UPDATE_C(c, re, im)                        \
    do {                                                \
        t0 = (re)*alphar;  (c)[0] = (c)[0] + t0;        \
        t1 = (im)*alphar;  (c)[1] = (c)[1] + t1;        \
        t0 = (im)*alphai;  (c)[0] = (c)[0] - t0;        \
        t1 = (re)*alphai;  (c)[1] = (c)[1] + t1;        \
    } while (0)

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
    BLASLONG panel, tile, k;
    FLOAT *c0, *c1;                              /* write cursors: the two columns of the panel */
    FLOAT *pa, *pb;                              /* read cursors into ba / bb                   */
    FLOAT r00, i00, r10, i10, r01, i01, r11, i11; /* rXY/iXY: re/im sum for row X, column Y     */
    FLOAT t0, t1;                                /* scratch for ZG22_UPDATE_C                   */

    /* ---------------- panels of two complex columns ---------------- */
    for (panel = 0; panel < bn/2; panel++) {
        c0 = C;
        c1 = c0 + 2*ldc;
        pa = ba;

        /* 2x2 tiles */
        for (tile = 0; tile < bm/2; tile++) {
            pb = bb;
            r00 = 0;  i00 = 0;  r10 = 0;  i10 = 0;
            r01 = 0;  i01 = 0;  r11 = 0;  i11 = 0;

            /* four k-steps per trip */
            for (k = 0; k < bk/4; k++) {
#if ZG22_PLAIN
                ZG22_STEP_2X2(+, -, +, 0);  ZG22_STEP_2X2(+, -, +, 1);
                ZG22_STEP_2X2(+, -, +, 2);  ZG22_STEP_2X2(+, -, +, 3);
#endif
#if ZG22_CONJ_B
                ZG22_STEP_2X2(+, +, -, 0);  ZG22_STEP_2X2(+, +, -, 1);
                ZG22_STEP_2X2(+, +, -, 2);  ZG22_STEP_2X2(+, +, -, 3);
#endif
#if ZG22_CONJ_A
                ZG22_STEP_2X2(-, +, +, 0);  ZG22_STEP_2X2(-, +, +, 1);
                ZG22_STEP_2X2(-, +, +, 2);  ZG22_STEP_2X2(-, +, +, 3);
#endif
#if ZG22_CONJ_AB
                ZG22_STEP_2X2(-, -, -, 0);  ZG22_STEP_2X2(-, -, -, 1);
                ZG22_STEP_2X2(-, -, -, 2);  ZG22_STEP_2X2(-, -, -, 3);
#endif
                pa += 16;
                pb += 16;
            }
            /* leftover k-steps, one at a time */
            for (k = 0; k < (bk&3); k++) {
#if ZG22_PLAIN
                ZG22_STEP_2X2(+, -, +, 0);
#endif
#if ZG22_CONJ_B
                ZG22_STEP_2X2(+, +, -, 0);
#endif
#if ZG22_CONJ_A
                ZG22_STEP_2X2(-, +, +, 0);
#endif
#if ZG22_CONJ_AB
                ZG22_STEP_2X2(-, -, -, 0);
#endif
                pa += 4;
                pb += 4;
            }

            ZG22_UPDATE_C(c0,     r00, i00);
            ZG22_UPDATE_C(c0 + 2, r10, i10);
            ZG22_UPDATE_C(c1,     r01, i01);
            ZG22_UPDATE_C(c1 + 2, r11, i11);
            c0 += 4;
            c1 += 4;
        }

        /* leftover 1x2 tile when bm is odd */
        for (tile = 0; tile < (bm&1); tile++) {
            pb = bb;
            r00 = 0;  i00 = 0;  r01 = 0;  i01 = 0;

            for (k = 0; k < bk; k++) {
#if ZG22_PLAIN
                ZG22_CMAC(+, -, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(+, -, +, r01, i01, pa[0], pa[1], pb[2], pb[3]);
#endif
#if ZG22_CONJ_B
                ZG22_CMAC(+, +, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(+, +, -, r01, i01, pa[0], pa[1], pb[2], pb[3]);
#endif
#if ZG22_CONJ_A
                ZG22_CMAC(-, +, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(-, +, +, r01, i01, pa[0], pa[1], pb[2], pb[3]);
#endif
#if ZG22_CONJ_AB
                ZG22_CMAC(-, -, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(-, -, -, r01, i01, pa[0], pa[1], pb[2], pb[3]);
#endif
                pa += 2;
                pb += 4;
            }

            ZG22_UPDATE_C(c0, r00, i00);
            ZG22_UPDATE_C(c1, r01, i01);
            c0 += 2;
            c1 += 2;
        }

        bb += 4*bk;     /* next two-column slice of bb   */
        C  += 4*ldc;    /* next two complex columns of C */
    }

    /* ---------------- leftover single column when bn is odd ---------------- */
    for (panel = 0; panel < (bn&1); panel++) {
        c0 = C;
        pa = ba;

        /* 2x1 tiles */
        for (tile = 0; tile < bm/2; tile++) {
            pb = bb;
            r00 = 0;  i00 = 0;  r10 = 0;  i10 = 0;

            for (k = 0; k < bk; k++) {
#if ZG22_PLAIN
                ZG22_CMAC(+, -, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(+, -, +, r10, i10, pa[2], pa[3], pb[0], pb[1]);
#endif
#if ZG22_CONJ_B
                ZG22_CMAC(+, +, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(+, +, -, r10, i10, pa[2], pa[3], pb[0], pb[1]);
#endif
#if ZG22_CONJ_A
                ZG22_CMAC(-, +, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(-, +, +, r10, i10, pa[2], pa[3], pb[0], pb[1]);
#endif
#if ZG22_CONJ_AB
                ZG22_CMAC(-, -, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
                ZG22_CMAC(-, -, -, r10, i10, pa[2], pa[3], pb[0], pb[1]);
#endif
                pa += 4;
                pb += 2;
            }

            ZG22_UPDATE_C(c0,     r00, i00);
            ZG22_UPDATE_C(c0 + 2, r10, i10);
            c0 += 4;
        }

        /* leftover 1x1 tile when bm is odd */
        for (tile = 0; tile < (bm&1); tile++) {
            pb = bb;
            r00 = 0;  i00 = 0;

            for (k = 0; k < bk; k++) {
#if ZG22_PLAIN
                ZG22_CMAC(+, -, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
#endif
#if ZG22_CONJ_B
                ZG22_CMAC(+, +, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
#endif
#if ZG22_CONJ_A
                ZG22_CMAC(-, +, +, r00, i00, pa[0], pa[1], pb[0], pb[1]);
#endif
#if ZG22_CONJ_AB
                ZG22_CMAC(-, -, -, r00, i00, pa[0], pa[1], pb[0], pb[1]);
#endif
                pa += 2;
                pb += 2;
            }

            ZG22_UPDATE_C(c0, r00, i00);
            c0 += 2;
        }

        bb += 2*bk;
        C  += 2*ldc;
    }
    return 0;
}

#undef ZG22_UPDATE_C
#undef ZG22_STEP_2X2
#undef ZG22_CMAC
#undef ZG22_CONJ_AB
#undef ZG22_CONJ_A
#undef ZG22_CONJ_B
#undef ZG22_PLAIN

View File

@ -0,0 +1,923 @@
#include "common.h"
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
FLOAT* C,BLASLONG ldc, BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
for (j=0; j<bn/2; j+=1)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 2;
#endif
for (k=0; k<temp/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
#endif
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
for (k=0; k<(temp&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C0[2] = load2;
C0[3] = load3;
load4 = res4*alphar-res5*alphai;
load5 = res5*alphar+res4*alphai;
C1[0] = load4;
C1[1] = load5;
load6 = res6*alphar-res7*alphai;
load7 = res7*alphar+res6*alphai;
C1[2] = load6;
C1[3] = load7;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
C1 = C1+4;
}
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3-load0*load5;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3-load0*load5;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C1[0] = load2;
C1[1] = load3;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
C1 = C1+2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C0[2] = load2;
C0[3] = load3;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
}
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0 = 0;
res1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

View File

@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_RT = trsm_kernel_RT.S
endif endif
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S CTRSMKERNEL_LN = ztrsm_kernel_LT.S
CTRSMKERNEL_LT = ztrsm_kernel_LT.S endif
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
CGEMM3MKERNEL = zgemm3m_kernel.S CGEMM3MKERNEL = zgemm3m_kernel.S
ZGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S

View File

@ -1,18 +1,48 @@
SAXPYKERNEL=axpy_loongson3a.S SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S DAXPYKERNEL=daxpy_loongson3a_simd.S
SGEMMKERNEL = sgemm_kernel_loongson3a.S SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMVTKERNEL = gemv_t_loongson3a.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm_kernel_loongson3a.S DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -0,0 +1,64 @@
# Kernel source selection list. Each <P><ROUTINE>KERNEL variable names the source
# file implementing that routine, where the leading S/D/C/Z is the BLAS precision
# prefix (single, double, single-complex, double-complex).
# Bare file names and ../generic/ paths are both relative to the directory of the
# makefile that consumes these variables.
# NOTE(review): the name of this newly added file is not visible in this view;
# presumably it is included by the kernel build makefiles -- confirm.
#
# AXPY: Loongson-3A-specific assembly sources.
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
# GEMV: S and D share the real C kernels; C and Z share the complex (z*) C kernels.
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
# TRMM: generic 2x2 C kernels (real source for S/D, z* source for C/Z).
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM: generic 2x2 C kernels plus the matching width-2 copy routines.
# Only the "O" copy routines (ONCOPY/OTCOPY) are set for each precision; no
# "I" copy variables (INCOPY/ITCOPY) are defined in this list.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM: all four precisions point at the same generic C sources, one per
# side/transpose variant (LN, LT, RN, RT).
# NOTE(review): the complex (C/Z) entries reuse the real-named sources; presumably
# those files handle the complex case internally -- confirm.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,101 @@
#include "common.h"
// These are auto-tuning codes on Loongson-3A platform.
// Alternative prefetch implementations, kept for reference but disabled:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Active prefetch: emits an `ld` instruction whose destination is register $0,
// with x as its memory operand, so the loaded value is not kept.
// NOTE(review): presumably this acts as a cache-prefetch hint on Loongson-3A;
// x may lie past the end of the array at the call sites -- confirm this is safe.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints built on __builtin_expect.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Single-element update steps. They are not self-contained: each one expands to
// code that uses A, X, Y, LDA, i, j and k (plus ALPHA, h and INCY where they
// appear) from the expansion site, and advances i (and h) as a side effect.
//   spec_*   : Y is indexed by i directly.
//   norm_*   : Y is indexed by h, and h is stepped by INCY after each element.
//   *_alpha1 : the multiplication by ALPHA is left out.
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
/*
 * Real GEMV kernel, non-transposed form. For every column j in [0, N) and every
 * row i in [0, M) it performs
 *
 *     Y[i * INCY] += ALPHA * A[LDA * j + i] * X[j * INCX]
 *
 * M, N     - number of rows / columns of A that are visited
 * UNUSED   - not read
 * ALPHA    - scalar factor; when it is zero the function returns immediately
 * A, LDA   - matrix data and the distance between consecutive columns
 * X, INCX  - input vector and its stride
 * Y, INCY  - vector updated in place and its stride
 * BUFFER   - not read
 *
 * Always returns 0. Strides are used exactly as passed in; no start-offset
 * adjustment is made for negative strides.
 *
 * Rows are processed four at a time, with a prefetch issued ahead of the
 * current position in A and Y, and a one-at-a-time loop finishes the leftover
 * rows. ALPHA == 1 and INCY == 1 each get a dedicated loop so the multiply by
 * ALPHA and the strided Y index can be dropped.
 *
 * The spec_loop / norm_loop macros pick up the local names i, j, k and h, so
 * those names must stay as they are.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    /* Starting indices into X and Y; both are fixed at zero. */
    const BLASLONG x_origin = 0;
    const BLASLONG y_origin = 0;
    /* Distance, in elements, between the current position and the prefetch. */
    const BLASLONG lookahead = 30;
    /* Number of update steps written out in each unrolled iteration. */
    const BLASLONG unroll = 4;
    BLASLONG unrolled_rows;
    BLASLONG j, k;

    if (ALPHA == 0) {
        return 0;
    }

    /* Largest multiple of the unroll factor that does not exceed M. */
    unrolled_rows = M - M % unroll;

    if (ALPHA == 1) {
        if (INCY == 1) {
            /* Unit-stride Y, no scaling. */
            for (j = 0, k = x_origin; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[i + lookahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                while (likely(i < M)) {
                    spec_loop_alpha1;
                }
            }
        } else {
            /* Strided Y, no scaling. */
            for (j = 0, k = x_origin; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0, h = y_origin;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[h + lookahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                while (likely(i < M)) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        if (INCY == 1) {
            /* Unit-stride Y, scaled by ALPHA. */
            for (j = 0, k = x_origin; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[i + lookahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                while (likely(i < M)) {
                    spec_loop;
                }
            }
        } else {
            /* Strided Y, scaled by ALPHA. */
            for (j = 0, k = x_origin; likely(j < N); j++, k += INCX) {
                BLASLONG i = 0, h = y_origin;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[h + lookahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                while (likely(i < M)) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}

View File

@ -0,0 +1,93 @@
#include "common.h"
// These are auto-tuning codes on Loongson-3A platform.
// Alternative prefetch implementations, kept for reference but disabled:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Active prefetch: emits an `ld` instruction whose destination is register $0,
// with x as its memory operand, so the loaded value is not kept.
// NOTE(review): presumably this acts as a cache-prefetch hint on Loongson-3A;
// x may lie past the end of the array at the call sites -- confirm this is safe.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints built on __builtin_expect.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Single-element accumulation steps into Y[k]. They are not self-contained:
// each one expands to code that uses A, X, Y, LDA, i, j and k (plus ALPHA, h
// and INCX where they appear) from the expansion site, and advances i (and h)
// as a side effect.
//   spec_*   : X is indexed by i directly.
//   norm_*   : X is indexed by h, and h is stepped by INCX after each element.
//   *_alpha1 : the multiplication by ALPHA is left out.
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
/*
 * Real GEMV kernel, transposed form. For every column j in [0, N) and every
 * row i in [0, M) it performs
 *
 *     Y[j * INCY] += ALPHA * A[LDA * j + i] * X[i * INCX]
 *
 * M, N     - number of rows / columns of A that are visited
 * UNUSED   - not read
 * ALPHA    - scalar factor; when it is zero the function returns immediately
 * A, LDA   - matrix data and the distance between consecutive columns
 * X, INCX  - input vector and its stride
 * Y, INCY  - vector updated in place and its stride
 * BUFFER   - not read
 *
 * Always returns 0. Strides are used exactly as passed in; no start-offset
 * adjustment is made for negative strides.
 *
 * Rows are processed three at a time, with a prefetch issued ahead of the
 * current position in A and X, and a one-at-a-time loop finishes the leftover
 * rows. ALPHA == 1 and INCX == 1 each get a dedicated loop so the multiply by
 * ALPHA and the strided X index can be dropped.
 *
 * The spec_loop / norm_loop macros pick up the local names i, j, k and h, so
 * those names must stay as they are.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    /* Distance, in elements, between the current position and the prefetch. */
    const BLASLONG lookahead = 30;
    /* Number of update steps written out in each unrolled iteration. */
    const BLASLONG unroll = 3;
    BLASLONG unrolled_rows;
    BLASLONG j, k;

    if (ALPHA == 0) {
        return 0;
    }

    /* Largest multiple of the unroll factor that does not exceed M. */
    unrolled_rows = M - M % unroll;

    if (ALPHA == 1) {
        if (INCX == 1) {
            /* Unit-stride X, no scaling. */
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                BLASLONG i = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[i + lookahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                while (likely(i < M)) {
                    spec_loop_alpha1;
                }
            }
        } else {
            /* Strided X, no scaling. */
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                BLASLONG i = 0, h = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[h + lookahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                while (likely(i < M)) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        if (INCX == 1) {
            /* Unit-stride X, scaled by ALPHA. */
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                BLASLONG i = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[i + lookahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                while (likely(i < M)) {
                    spec_loop;
                }
            }
        } else {
            /* Strided X, scaled by ALPHA. */
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                BLASLONG i = 0, h = 0;

                while (likely(i < unrolled_rows)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[h + lookahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                while (likely(i < M)) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,139 @@
#include "common.h"
// Stand-alone typedefs, disabled; the types come from common.h instead.
//typedef int BLASLONG;
//typedef double FLOAT;
// Prefetch: emits an `ld` instruction whose destination is register $0, with x
// as its memory operand, so the loaded value is not kept.
// NOTE(review): presumably this acts as a cache-prefetch hint on Loongson-3A -- confirm.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints built on __builtin_expect.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Compile-time selection of the conjugation variant. The unsuffixed names used
// by the kernel body are mapped to one of four suffixed families:
//   _0 : neither CONJ nor XCONJ ->       A  *      X
//   _1 : CONJ only              ->  conj(A) *      X
//   _2 : XCONJ only             ->       A  * conj(X)
//   _3 : CONJ and XCONJ         ->  conj(A) * conj(X)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
// All families below process one complex element per expansion. A complex value
// occupies two consecutive FLOATs, real part first: A[jj + ii] / A[jj + ii + 1],
// X[k] / X[k + 1], and Y[...] / Y[... + 1]. The macros are not self-contained:
// they use A, X, Y, jj, ii, k (plus iii, INCY, rALPHA, iALPHA, rTmp, iTmp where
// they appear) from the expansion site and advance ii (and iii) as a side effect.
//
// spec_loop_alpha1_*: Y indexed by ii, product added to Y without scaling.
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
// spec_loop_*: Y indexed by ii; the product is first formed in (rTmp, iTmp) and
// then multiplied by the complex scalar (rALPHA, iALPHA) before being added.
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
// norm_loop_alpha1_*: as spec_loop_alpha1_*, but Y is indexed by iii, which is
// stepped by INCY * 2 after each element.
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
// norm_loop_*: as spec_loop_*, but Y is indexed by iii, which is stepped by
// INCY * 2 after each element.
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
/*
 * Complex GEMV kernel, non-transposed form. For every column j in [0, N) and
 * every row i in [0, M) it adds alpha * op(A[i, j]) * op(X[j]) to Y[i], where
 * alpha = rALPHA + i*iALPHA and the conjugation applied to A and X is chosen
 * at compile time by the spec_loop / norm_loop macro variant in use.
 *
 * Complex values are stored as two consecutive FLOATs (real, imaginary), so
 * every index and stride below is doubled:
 *   jj  = 2 * LDA * j   start of column j in A
 *   k   = 2 * INCX * j  position of X[j]
 *   ii  = 2 * i         row offset within the column (and in Y when INCY == 1)
 *   iii = 2 * INCY * i  position of Y[i] when INCY != 1
 *
 * M, N            - number of rows / columns of A that are visited
 * UNUSED          - not read
 * rALPHA, iALPHA  - real and imaginary part of the scalar factor
 * A, LDA          - matrix data and the column distance in complex elements
 * X, INCX         - input vector and its stride in complex elements
 * Y, INCY         - vector updated in place and its stride in complex elements
 * BUFFER          - not read
 *
 * Always returns 0.
 *
 * The macros pick up the local names ii, iii, jj, k, rTmp and iTmp, so those
 * names must stay as they are.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Nothing to add only when alpha is exactly zero, i.e. when BOTH parts are
	 * zero. The previous test (!rALPHA && iALPHA) returned early for a purely
	 * imaginary, non-zero alpha and so skipped a required update.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	/* Prefetch distance in FLOATs ahead of the current position. */
	BLASLONG fahead = 60;
	/* Complex elements handled per unrolled iteration. */
	BLASLONG spec_unroll = 2;
	/* Largest multiple of spec_unroll that does not exceed M. */
	BLASLONG tMQ = M - M % spec_unroll;
	BLASLONG j = 0, k = 0, jj = 0;

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: the product is added to Y without scaling. */
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				/* i counts complex rows; ii is advanced inside the macros. */
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Leftover row when M is odd. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: the macros form the product in rTmp/iTmp, then scale. */
		FLOAT rTmp, iTmp;
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}

View File

@ -0,0 +1,125 @@
#include "common.h"
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
/*
 * Per-element loop bodies. Every macro handles ONE complex element: A and X
 * are read as interleaved pairs ([idx] = real part, [idx + 1] = imaginary
 * part) and the complex product is accumulated into Y[k] (real) and
 * Y[k + 1] (imaginary).
 *
 * Numeric suffix = which operand is conjugated in the product:
 *   _0  A * X          _1  conj(A) * X
 *   _2  A * conj(X)    _3  conj(A) * conj(X)
 *
 * The macros use these names from the expansion site: A, X, Y, jj, ii, k,
 * plus iii and INCX (norm_* only) and rTmp, iTmp, rALPHA, iALPHA (non-alpha1
 * variants only). Each expansion advances ii by 2 (and iii by INCX * 2 for
 * norm_*), i.e. by one complex element.
 */
/* spec_loop_alpha1_*: X indexed with ii (same offset as A), product added to Y unscaled. */
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
/* spec_loop_*: X indexed with ii; product (rTmp + i*iTmp) is multiplied by (rALPHA + i*iALPHA) before being added to Y. */
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
/* norm_loop_alpha1_*: X indexed with its own offset iii (stepped by INCX * 2), product added to Y unscaled. */
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
/* norm_loop_*: X indexed with iii; product is multiplied by (rALPHA + i*iALPHA) before being added to Y. */
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
/*
 * Complex transposed matrix-vector kernel.
 *
 * For every column j in [0, N) this accumulates
 *     Y[j] += alpha * sum_{i in [0, M)} a(i, j) * x(i)
 * where alpha = rALPHA + i*iALPHA and the per-element product (including any
 * conjugation of A and/or X) is whatever the spec_loop / norm_loop macros
 * selected at compile time expand to.
 *
 * Storage, as used by the indexing below (all values are interleaved
 * real/imaginary FLOAT pairs):
 *   A      column j starts at A[j * LDA * 2]; elements within a column are contiguous.
 *   X      element i is at X[i * INCX * 2].
 *   Y      element j is at Y[j * INCY * 2]; Y is only accumulated into, never cleared.
 *
 * UNUSED and BUFFER are not referenced by this kernel.
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    /*
     * alpha == 0 adds nothing to Y, so there is no work to do.
     * Both components must be tested for zero: the previous test
     * (!rALPHA && iALPHA) returned early for a purely imaginary, non-zero
     * alpha and silently skipped the whole update in that case.
     */
    if(rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 30;                   /* prefetch distance, in FLOAT elements */
    BLASLONG spec_unroll = 2;               /* complex elements handled per unrolled iteration */
    BLASLONG tMQ = M - M % spec_unroll;     /* largest multiple of spec_unroll that is <= M */
    BLASLONG j = 0, k = 0, jj = 0;          /* column index, offset into Y, offset into A */

    if(rALPHA == 1 && iALPHA == 0) {
        /* alpha == 1: products are added to Y directly, no scaling needed. */
        if(INCX == 1) {
            /* Unit-stride X: A and X share the same running offset ii. */
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                /* Tail: the element left over when M is odd. */
                for(; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            /* Strided X: X has its own running offset iii, advanced by INCX * 2. */
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for(; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        /* General alpha: each product is formed in (rTmp, iTmp) and scaled by alpha. */
        FLOAT rTmp, iTmp;
        if(INCX == 1) {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for(; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for(; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}

View File

@ -239,6 +239,22 @@ ifndef ZSWAPKERNEL
ZSWAPKERNEL = zswap_sse2.S ZSWAPKERNEL = zswap_sse2.S
endif endif
# Default GEMV kernel sources: each variable below is assigned its SSE2
# assembly file only when it has no (non-empty) value yet, so an earlier
# assignment takes precedence.
ifndef DGEMVNKERNEL
DGEMVNKERNEL = gemv_n_sse2.S
endif
ifndef DGEMVTKERNEL
DGEMVTKERNEL = gemv_t_sse2.S
endif
# Same fallback for the Z-prefixed GEMV kernel variables.
ifndef ZGEMVNKERNEL
ZGEMVNKERNEL = zgemv_n_sse2.S
endif
ifndef ZGEMVTKERNEL
ZGEMVTKERNEL = zgemv_t_sse2.S
endif
endif endif

79
param.h
View File

@ -1480,31 +1480,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL #define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 32 #define ZGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_P 32 #define ZGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_P 108
#define ZGEMM_DEFAULT_P 112
#define SGEMM_DEFAULT_Q 116 #define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_P 44
#define CGEMM_DEFAULT_Q 144 #define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_Q 72 #define ZGEMM_DEFAULT_P 32
#define SGEMM_DEFAULT_R 1000 #define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_Q 92
#define CGEMM_DEFAULT_R 2000 #define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_R 2000 #define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 640
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 640
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
#ifdef LOONGSON3B
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 24
#define CGEMM_DEFAULT_P 24
#define ZGEMM_DEFAULT_P 20
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16 #define SYMV_P 16
#endif #endif

View File

@ -1301,6 +1301,8 @@
NC = 0 NC = 0
RESET = .TRUE. RESET = .TRUE.
ERRMAX = RZERO ERRMAX = RZERO
RALS = RONE
RBETS = RONE
* *
DO 100 IN = 1, NIDIM DO 100 IN = 1, NIDIM
N = IDIM( IN ) N = IDIM( IN )

View File

@ -1303,6 +1303,8 @@
NC = 0 NC = 0
RESET = .TRUE. RESET = .TRUE.
ERRMAX = RZERO ERRMAX = RZERO
RALS = RONE
RBETS = RONE
* *
DO 100 IN = 1, NIDIM DO 100 IN = 1, NIDIM
N = IDIM( IN ) N = IDIM( IN )