Merge branch 'release-0.1.0'
This commit is contained in:
@@ -1,4 +1,22 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.1.0
|
||||
23-Mar-2012
|
||||
common:
|
||||
* Set soname of shared library on Linux.
|
||||
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
|
||||
this flag to control the library name, e.g. libopenblas.a,
|
||||
libopenblas_ifort.a or libopenblas_omp.a.
|
||||
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
|
||||
The library uses a single thread in GEMM functions for small matrices.
|
||||
x86/x86_64:
|
||||
* Used GEMV SSE/SSE2 kernels on x86 32-bit.
|
||||
* Exported CBLAS functions in Windows DLL.
|
||||
MIPS64:
|
||||
* Completed Level-3 BLAS optimization on Loongson 3A CPU.
|
||||
* Improved GEMV performance on Loongson 3A CPU.
|
||||
* Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL)
|
||||
|
||||
====================================================================
|
||||
Version 0.1 alpha2.5
|
||||
19-Feb-2012
|
||||
|
||||
26
Makefile
26
Makefile
@@ -82,27 +82,28 @@ endif
|
||||
shared :
|
||||
ifeq ($(OSNAME), Linux)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
$(MAKE) -C exports dyn
|
||||
-ln -fs $(LIBDYNNAME) libopenblas.dylib
|
||||
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
$(MAKE) -C exports dll
|
||||
-ln -fs $(LIBDLLNAME) libopenblas.dll
|
||||
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
$(MAKE) -C exports dll
|
||||
-ln -fs $(LIBDLLNAME) libopenblas.dll
|
||||
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
|
||||
endif
|
||||
|
||||
tests :
|
||||
@@ -130,7 +131,7 @@ endif
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
|
||||
endif
|
||||
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
@@ -158,7 +159,7 @@ endif
|
||||
prof : prof_blas prof_lapack
|
||||
|
||||
prof_blas :
|
||||
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d prof || exit 1 ; \
|
||||
@@ -169,7 +170,7 @@ ifdef DYNAMIC_ARCH
|
||||
endif
|
||||
|
||||
blas :
|
||||
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d libs || exit 1 ; \
|
||||
@@ -177,7 +178,7 @@ blas :
|
||||
done
|
||||
|
||||
hpl :
|
||||
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
@@ -191,7 +192,7 @@ ifdef DYNAMIC_ARCH
|
||||
endif
|
||||
|
||||
hpl_p :
|
||||
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
@@ -285,7 +286,8 @@ clean ::
|
||||
#ifdef DYNAMIC_ARCH
|
||||
@$(MAKE) -C kernel clean
|
||||
#endif
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
|
||||
@$(MAKE) -C reference clean
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||
@if test -d lapack-3.4.0; then \
|
||||
echo deleting lapack-3.4.0; \
|
||||
|
||||
@@ -38,33 +38,34 @@ install : lib.grd
|
||||
#for install static library
|
||||
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
|
||||
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.$(LIBSUFFIX)
|
||||
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
|
||||
#for install shared library
|
||||
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), Linux)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dylib
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
|
||||
endif
|
||||
|
||||
@echo Install OK!
|
||||
|
||||
@@ -3,7 +3,12 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.1alpha2.5
|
||||
VERSION = 0.1.0
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
# is libopenblas_$(LIBNAMESUFFIX).so.0.
|
||||
# LIBNAMESUFFIX = omp
|
||||
|
||||
# You can specify the target architecture, otherwise it's
|
||||
# automatically detected.
|
||||
@@ -83,6 +88,11 @@ VERSION = 0.1alpha2.5
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||
# with a single thread. You can use this flag to avoid the overhead of multi-threading
|
||||
# for small matrix sizes. The default value is 4.
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need a sanity check by comparing with the reference BLAS. It'll be very
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
@@ -40,6 +40,11 @@ ifdef INTERFACE64
|
||||
GETARCH_FLAGS += -DUSE64BITINT
|
||||
endif
|
||||
|
||||
ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||
GEMM_MULTITHREAD_THRESHOLD=4
|
||||
endif
|
||||
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
||||
|
||||
# This operation is expensive, so it should be executed only once.
|
||||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
@@ -274,7 +279,12 @@ endif
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
@@ -341,7 +351,8 @@ endif
|
||||
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
FCOMMON_OPT += -Wall
|
||||
EXTRALIB += -lgfortran
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifdef BINARY64
|
||||
@@ -528,8 +539,10 @@ ifdef SMP
|
||||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
@@ -568,7 +581,11 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
# Base name shared by every library artifact (static archive, shared object,
# convenience symlinks). When LIBNAMESUFFIX has a non-empty value it is
# appended after an underscore, e.g. LIBNAMESUFFIX=omp gives libopenblas_omp;
# otherwise the plain libopenblas name is used.
ifdef LIBNAMESUFFIX
LIBPREFIX = libopenblas_$(LIBNAMESUFFIX)
else
LIBPREFIX = libopenblas
endif
|
||||
|
||||
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||
|
||||
@@ -590,9 +607,11 @@ endif
|
||||
|
||||
ifneq ($(ARCH), x86_64)
|
||||
ifneq ($(ARCH), x86)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
NO_AFFINITY = 1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
CCOMMON_OPT += -DNO_AFFINITY
|
||||
@@ -636,6 +655,7 @@ MD5SUM = md5sum
|
||||
AWK = awk
|
||||
|
||||
REVISION = -r$(VERSION)
|
||||
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
|
||||
|
||||
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
||||
1
README
1
README
@@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
|
||||
9.Known Issues
|
||||
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
||||
is 64. On 32 bits, it is 32.
|
||||
* On Loongson 3A, make test may fail because of a pthread_create error (error code EAGAIN). However, the same test case passes when it is run directly from the shell. I don't think this is a bug in OpenBLAS.
|
||||
|
||||
10. Specification of Git Branches
|
||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
||||
|
||||
@@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
|
||||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
unsigned long *nodemask, unsigned long maxnode,
|
||||
unsigned flags) {
|
||||
#if defined (LOONGSON3B)
|
||||
#if defined (__64BIT__)
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#else
|
||||
return 0; //NULL Implementation on Loongson 3B 32bit.
|
||||
#endif
|
||||
#else
|
||||
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
|
||||
unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags);
|
||||
// unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
|
||||
@@ -2127,7 +2127,9 @@
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sgemm_p;
|
||||
extern BLASLONG sgemm_q;
|
||||
extern BLASLONG sgemm_r;
|
||||
|
||||
@@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
#if defined(LOONGSON3A)
|
||||
unsigned long long tmp;
|
||||
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
|
||||
ret=tmp;
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
// unsigned long long tmp;
|
||||
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
|
||||
//ret=tmp;
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
"rdhwr %0, $2\n"
|
||||
".set pop": "=r"(ret):: "memory");
|
||||
|
||||
#else
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
@@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#if !defined(NO_AFFINITY)
/* Advertise that this architecture supplies its own WhereAmI(). */
#define WHEREAMI

/*
 * Return the value of hardware register $0, read with the RDHWR instruction.
 * NOTE(review): per the MIPS32 R2 RDHWR definition, register 0 should be the
 * number of the CPU the caller is running on -- confirm for Loongson 3A/3B.
 *
 * The assembler is temporarily switched to mips32r2 (.set push / .set pop)
 * so that rdhwr is accepted regardless of the -march in effect.
 */
static inline int WhereAmI(void)
{
  int cpu_id = 0;

  __asm__ __volatile__(
      ".set push \n"
      ".set mips32r2\n"
      "rdhwr %0, $0\n"
      ".set pop"
      : "=r"(cpu_id)
      : /* no inputs */
      : "memory");

  return cpu_id;
}
#endif /* !NO_AFFINITY */
#endif /* LOONGSON3A || LOONGSON3B */
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
@@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define CMPEQ c.eq.d
|
||||
#define CMPLE c.le.d
|
||||
#define CMPLT c.lt.d
|
||||
#define NEG neg.d
|
||||
#else
|
||||
#define LD lwc1
|
||||
#define ST swc1
|
||||
@@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define CMPEQ c.eq.s
|
||||
#define CMPLE c.le.s
|
||||
#define CMPLT c.lt.s
|
||||
#define PLU plu.ps
|
||||
#define PLL pll.ps
|
||||
#define PUU puu.ps
|
||||
#define PUL pul.ps
|
||||
#define MADPS madd.ps
|
||||
#define CVTU cvt.s.pu
|
||||
#define CVTL cvt.s.pl
|
||||
#define NEG neg.s
|
||||
#endif
|
||||
|
||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
@@ -218,13 +247,18 @@ REALNAME: ;\
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 8 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#define PAGESIZE (32UL << 10)
|
||||
#define FIXED_PAGESIZE (32UL << 10)
|
||||
#endif
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE (64UL << 10)
|
||||
#endif
|
||||
@@ -236,7 +270,7 @@ REALNAME: ;\
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#define PREFETCHD_(x) ld $0, x
|
||||
#define PREFETCHD(x) PREFETCHD_(x)
|
||||
#else
|
||||
|
||||
19
cpuid_mips.c
19
cpuid_mips.c
@@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A"
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -101,6 +103,8 @@ int detect(void){
|
||||
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
@@ -130,6 +134,8 @@ void get_architecture(void){
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
}else{
|
||||
printf("SICORTEX");
|
||||
}
|
||||
@@ -149,6 +155,15 @@ void get_cpuconfig(void){
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define SICORTEX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
@@ -164,6 +179,8 @@ void get_cpuconfig(void){
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
}else{
|
||||
#ifdef __mips64
|
||||
printf("mips64\n");
|
||||
|
||||
@@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
range_M[0] = 0;
|
||||
i = arg -> m;
|
||||
} else {
|
||||
range_M[0] = range_M[0];
|
||||
i = range_M[1] - range_M[0];
|
||||
range_M[0] = range_m[0];
|
||||
i = range_m[1] - range_m[0];
|
||||
}
|
||||
|
||||
num_cpu_m = 0;
|
||||
|
||||
@@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||
queue[num_cpu].args = arg;
|
||||
queue[num_cpu].range_m = range_m;
|
||||
queue[num_cpu].range_n = &range[num_cpu];
|
||||
queue[num_cpu].sa = NULL;
|
||||
#if defined(LOONGSON3A)
|
||||
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
||||
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[num_cpu].sa = NULL;
|
||||
queue[num_cpu].sb = NULL;
|
||||
#endif
|
||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||
num_cpu ++;
|
||||
}
|
||||
|
||||
if (num_cpu) {
|
||||
#if defined(LOONGSON3A)
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
|
||||
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
#endif
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas(num_cpu,
|
||||
|
||||
@@ -55,8 +55,8 @@ int CNAME(int mode,
|
||||
range_M[0] = 0;
|
||||
i = arg -> m;
|
||||
} else {
|
||||
range_M[0] = range_M[0];
|
||||
i = range_M[1] - range_M[0];
|
||||
range_M[0] = range_m[0];
|
||||
i = range_m[1] - range_m[0];
|
||||
}
|
||||
|
||||
num_cpu_m = 0;
|
||||
|
||||
@@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
|
||||
/* Initializing routine */
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
@@ -545,12 +546,16 @@ int blas_thread_init(void){
|
||||
pthread_cond_init (&thread_status[i].wakeup, NULL);
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_create(&blas_threads[i], &attr,
|
||||
ret=pthread_create(&blas_threads[i], &attr,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#else
|
||||
pthread_create(&blas_threads[i], NULL,
|
||||
ret=pthread_create(&blas_threads[i], NULL,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#endif
|
||||
if(ret!=0){
|
||||
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
@@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
|
||||
|
||||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
@@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
|
||||
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
|
||||
@@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#ifdef DEBUG
|
||||
int ret;
|
||||
int ret=0;
|
||||
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
if(ret==-1){
|
||||
int errsv=errno;
|
||||
perror("alloc_mmap:");
|
||||
perror("OpenBLAS alloc_mmap:");
|
||||
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||
}
|
||||
|
||||
@@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
|
||||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
@@ -45,8 +45,22 @@ int get_L2_size(void);
|
||||
#define DEFAULT_GEMM_P 128
|
||||
#define DEFAULT_GEMM_Q 128
|
||||
#define DEFAULT_GEMM_R 128
|
||||
#define DEFAULT_GEMM_OFFSET_A 0
|
||||
#define DEFAULT_GEMM_OFFSET_B 0
|
||||
|
||||
/* Global Parameter */
|
||||
#if GEMM_OFFSET_A == gemm_offset_a
|
||||
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
|
||||
#else
|
||||
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
|
||||
#endif
|
||||
|
||||
#if GEMM_OFFSET_B == gemm_offset_b
|
||||
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
||||
#else
|
||||
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||
#endif
|
||||
|
||||
#if SGEMM_P == sgemm_p
|
||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||
#else
|
||||
@@ -666,3 +680,36 @@ void blas_set_parameter(void){
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_MIPS64)
/*
 * Pick the dgemm_r blocking parameter for Loongson cores.
 *
 * Without SMP the single-thread value is always used. With SMP the value
 * depends on blas_num_threads:
 *   LOONGSON3A: 1024 when blas_num_threads == 1, otherwise 200.
 *   LOONGSON3B:  640 when blas_num_threads is 1 or 2, otherwise 160.
 * On any other MIPS64 core this function changes nothing.
 */
void blas_set_parameter(void)
{
#if defined(LOONGSON3A)
#ifdef SMP
  /* single thread -> large block, multi thread -> small block */
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  dgemm_r = 1024;
#endif
#endif

#if defined(LOONGSON3B)
#ifdef SMP
  /* one or two threads share the single-thread setting */
  dgemm_r = (blas_num_threads == 1 || blas_num_threads == 2) ? 640 : 160;
#else
  dgemm_r = 640;
#endif
#endif
}
#endif /* ARCH_MIPS64 */
|
||||
|
||||
@@ -58,16 +58,16 @@ dll : ../$(LIBDLLNAME)
|
||||
|
||||
dll2 : libgoto2_shared.dll
|
||||
|
||||
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
|
||||
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
|
||||
$(RANLIB) ../$(LIBNAME)
|
||||
ifeq ($(BINARY32), 1)
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
||||
-lib /machine:i386 /def:libgoto2.def
|
||||
-lib /machine:i386 /def:libopenblas.def
|
||||
else
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
||||
-lib /machine:X64 /def:libgoto2.def
|
||||
-lib /machine:X64 /def:libopenblas.def
|
||||
endif
|
||||
|
||||
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
||||
@@ -75,7 +75,7 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
||||
|
||||
libgoto2.def : gensymbol
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||
|
||||
libgoto2_shared.def : gensymbol
|
||||
@@ -100,7 +100,7 @@ so : ../$(LIBSONAME)
|
||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||
-Wl,--retain-symbols-file=linux.def $(EXTRALIB)
|
||||
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
|
||||
@@ -301,7 +301,7 @@
|
||||
if ($ARGV[5] == 1) {
|
||||
#NO_LAPACK=1
|
||||
@objs = (@blasobjs);
|
||||
} elsif (-d "../lapack-3.1.1") {
|
||||
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") {
|
||||
@objs = (@blasobjs, @lapackobjs, @lapackobjs2);
|
||||
} else {
|
||||
@objs = (@blasobjs, @lapackobjs);
|
||||
@@ -389,6 +389,13 @@ if ($ARGV[0] eq "win2k"){
|
||||
$count ++;
|
||||
}
|
||||
|
||||
if ($ARGV[4] == 0) {
|
||||
foreach $objs (@cblasobjs) {
|
||||
print "\t",$objs,"=$objs"," \@", $count, "\n";
|
||||
$count ++;
|
||||
}
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
4
f_check
4
f_check
@@ -284,6 +284,10 @@ if ($link ne "") {
|
||||
|
||||
}
|
||||
|
||||
if ($vendor eq "INTEL"){
|
||||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
|
||||
|
||||
15
getarch.c
15
getarch.c
@@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/* #define FORCE_CELL */
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3A */
|
||||
/* #define FORCE_LOONGSON3B */
|
||||
/* #define FORCE_ITANIUM2 */
|
||||
/* #define FORCE_GENERIC */
|
||||
/* #define FORCE_SPARC */
|
||||
@@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
/*
 * Forced target: Loongson 3B. Defining FORCE_LOONGSON3B selects the fixed
 * identification strings and cache/TLB configuration flags below.
 */
#ifdef FORCE_LOONGSON3B
#define FORCE
#define ARCHITECTURE    "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define CORENAME  "LOONGSON3B"
#define LIBNAME   "loongson3b"
#define SUBDIRNAME      "mips64"
/*
 * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB would be
 * 524288); the same value is printed by cpuid_mips.c -- confirm the
 * intended size before changing it.
 */
#define ARCHCONFIG   "-DLOONGSON3B " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#endif /* FORCE_LOONGSON3B */
|
||||
|
||||
#ifdef FORCE_ITANIUM2
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "IA64"
|
||||
|
||||
@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
|
||||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
#endif
|
||||
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -770,20 +770,36 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c
|
||||
xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
|
||||
$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
|
||||
|
||||
ifndef USE_NETLIB_GEMV
|
||||
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
else
|
||||
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
|
||||
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
endif
|
||||
|
||||
qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
|
||||
ifndef USE_NETLIB_GEMV
|
||||
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
else
|
||||
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
|
||||
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
endif
|
||||
|
||||
xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
@@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|
||||
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
|
||||
args.nthreads = 1;
|
||||
}else{
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
}
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
|
||||
285
interface/netlib/cgemv.f
Normal file
285
interface/netlib/cgemv.f
Normal file
@@ -0,0 +1,285 @@
|
||||
*     CGEMV  --  netlib implementation of the Level 2 Blas
*     matrix-vector product for COMPLEX data.
*
*     Operation (selected by TRANS; A is an m by n matrix, alpha and
*     beta are scalars, x and y are vectors):
*
*        TRANS = 'N' or 'n'   y := alpha*A*x    + beta*y
*        TRANS = 'T' or 't'   y := alpha*A**T*x + beta*y
*        TRANS = 'C' or 'c'   y := alpha*A**H*x + beta*y
*
*     Arguments (every argument except Y is unchanged on exit)
*
*     TRANS   CHARACTER*1.  Operation selector, see above.
*     M       INTEGER.  Number of rows of A; must be at least zero.
*     N       INTEGER.  Number of columns of A; must be at least zero.
*     ALPHA   COMPLEX.  The scalar alpha.
*     A       COMPLEX array of DIMENSION ( LDA, n ).  The leading
*             m by n part holds the matrix of coefficients.
*     LDA     INTEGER.  First dimension of A as declared by the
*             caller; must be at least max( 1, m ).
*     X       COMPLEX array holding the vector x, of DIMENSION at
*             least ( 1 + ( n - 1 )*abs( INCX ) ) when
*             TRANS = 'N' or 'n' and at least
*             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
*     INCX    INTEGER.  Increment between elements of X; must not be
*             zero.
*     BETA    COMPLEX.  The scalar beta.  When BETA is zero Y need
*             not be set on input.
*     Y       COMPLEX array, of DIMENSION at least
*             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
*             and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
*             With BETA non-zero it must hold the vector y on entry;
*             on exit it is overwritten by the updated vector y.
*     INCY    INTEGER.  Increment between elements of Y; must not be
*             zero.
*
*     The vector and matrix arguments are not referenced when
*     N = 0 or M = 0.  An illegal argument is reported by calling
*     XERBLA with its position in the argument list (1, 2, 3, 6, 8 or
*     11); this routine does not modify Y in that case.
*
*     Written on 22-October-1986.
*        Jack Dongarra, Argonne National Lab.
*        Jeremy Du Croz, Nag Central Office.
*        Sven Hammarling, Nag Central Office.
*        Richard Hanson, Sandia National Labs.
*
      SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      COMPLEX ALPHA,BETA
*     .. Array Arguments ..
      COMPLEX A(LDA,*),X(*),Y(*)
*     .. Parameters ..
      COMPLEX ZERO,ONE
      PARAMETER (ZERO=(0.0E+0,0.0E+0),ONE=(1.0E+0,0.0E+0))
*     .. Local Scalars ..
*     ACC      running sum / scaled x element
*     NX, NY   logical lengths of x and y
*     IX0, IY0 index of the first logical element of x and y
*     IXP, IYP current position in X and Y
      COMPLEX ACC
      INTEGER IERR,IR,JC,NX,NY,IX0,IY0,IXP,IYP
      LOGICAL NOTRAN,CONJA
*     .. External Functions ..
      LOGICAL LSAME
      EXTERNAL LSAME
*     .. External Subroutines ..
      EXTERNAL XERBLA
*     .. Intrinsic Functions ..
      INTRINSIC CONJG,MAX
*
*     Validate the arguments; the first failing test decides the
*     value handed to XERBLA.
*
      NOTRAN = LSAME(TRANS,'N')
      IERR = 0
      IF (.NOT.(NOTRAN .OR. LSAME(TRANS,'T') .OR.
     +    LSAME(TRANS,'C'))) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('CGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Anything other than 'T' in the transposed branch conjugates A.
*
      CONJA = .NOT.LSAME(TRANS,'T')
*
*     Logical vector lengths, and the start index of each vector.
*     With a negative increment the walk starts at the far end of
*     the array so that it steps back towards element 1.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      IX0 = 1
      IF (INCX.LE.0) IX0 = 1 - (NX-1)*INCX
      IY0 = 1
      IF (INCY.LE.0) IY0 = 1 - (NY-1)*INCY
*
*     Step 1:  y := beta*y.  BETA = 0 stores zeros without reading
*     Y; BETA = 1 leaves Y alone.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = ZERO
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = ZERO
                  IYP = IYP + INCY
              END DO
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = BETA*Y(IR)
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = BETA*Y(IYP)
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
*     Step 2:  add the matrix-vector product.  A is traversed column
*     by column in a single pass.
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y.  Columns whose x element is zero are
*        skipped.
*
          IXP = IX0
          IF (INCY.EQ.1) THEN
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      DO IR = 1,M
                          Y(IR) = Y(IR) + ACC*A(IR,JC)
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          ELSE
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      IYP = IY0
                      DO IR = 1,M
                          Y(IYP) = Y(IYP) + ACC*A(IR,JC)
                          IYP = IYP + INCY
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          END IF
      ELSE
*
*        y := alpha*A**T*x + y  or  y := alpha*A**H*x + y.  Each
*        column of A yields one dot product.
*
          IYP = IY0
          IF (INCX.EQ.1) THEN
              DO JC = 1,N
                  ACC = ZERO
                  IF (CONJA) THEN
                      DO IR = 1,M
                          ACC = ACC + CONJG(A(IR,JC))*X(IR)
                      END DO
                  ELSE
                      DO IR = 1,M
                          ACC = ACC + A(IR,JC)*X(IR)
                      END DO
                  END IF
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          ELSE
              DO JC = 1,N
                  ACC = ZERO
                  IXP = IX0
                  IF (CONJA) THEN
                      DO IR = 1,M
                          ACC = ACC + CONJG(A(IR,JC))*X(IXP)
                          IXP = IXP + INCX
                      END DO
                  ELSE
                      DO IR = 1,M
                          ACC = ACC + A(IR,JC)*X(IXP)
                          IXP = IXP + INCX
                      END DO
                  END IF
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
*
      RETURN
*
*     End of CGEMV .
*
      END
|
||||
265
interface/netlib/dgemv.f
Normal file
265
interface/netlib/dgemv.f
Normal file
@@ -0,0 +1,265 @@
|
||||
*     DGEMV  --  netlib implementation of the Level 2 Blas
*     matrix-vector product for DOUBLE PRECISION data.
*
*     Operation (selected by TRANS; A is an m by n matrix, alpha and
*     beta are scalars, x and y are vectors):
*
*        TRANS = 'N' or 'n'   y := alpha*A*x    + beta*y
*        TRANS = 'T' or 't'   y := alpha*A**T*x + beta*y
*        TRANS = 'C' or 'c'   y := alpha*A**T*x + beta*y
*
*     Arguments (every argument except Y is unchanged on exit)
*
*     TRANS   CHARACTER*1.  Operation selector, see above.
*     M       INTEGER.  Number of rows of A; must be at least zero.
*     N       INTEGER.  Number of columns of A; must be at least zero.
*     ALPHA   DOUBLE PRECISION.  The scalar alpha.
*     A       DOUBLE PRECISION array of DIMENSION ( LDA, n ).  The
*             leading m by n part holds the matrix of coefficients.
*     LDA     INTEGER.  First dimension of A as declared by the
*             caller; must be at least max( 1, m ).
*     X       DOUBLE PRECISION array holding the vector x, of
*             DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when
*             TRANS = 'N' or 'n' and at least
*             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
*     INCX    INTEGER.  Increment between elements of X; must not be
*             zero.
*     BETA    DOUBLE PRECISION.  The scalar beta.  When BETA is zero
*             Y need not be set on input.
*     Y       DOUBLE PRECISION array, of DIMENSION at least
*             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
*             and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
*             With BETA non-zero it must hold the vector y on entry;
*             on exit it is overwritten by the updated vector y.
*     INCY    INTEGER.  Increment between elements of Y; must not be
*             zero.
*
*     The vector and matrix arguments are not referenced when
*     N = 0 or M = 0.  An illegal argument is reported by calling
*     XERBLA with its position in the argument list (1, 2, 3, 6, 8 or
*     11); this routine does not modify Y in that case.
*
*     Written on 22-October-1986.
*        Jack Dongarra, Argonne National Lab.
*        Jeremy Du Croz, Nag Central Office.
*        Sven Hammarling, Nag Central Office.
*        Richard Hanson, Sandia National Labs.
*
      SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      DOUBLE PRECISION ALPHA,BETA
*     .. Array Arguments ..
      DOUBLE PRECISION A(LDA,*),X(*),Y(*)
*     .. Parameters ..
      DOUBLE PRECISION ZERO,ONE
      PARAMETER (ZERO=0.0D+0,ONE=1.0D+0)
*     .. Local Scalars ..
*     ACC      running sum / scaled x element
*     NX, NY   logical lengths of x and y
*     IX0, IY0 index of the first logical element of x and y
*     IXP, IYP current position in X and Y
      DOUBLE PRECISION ACC
      INTEGER IERR,IR,JC,NX,NY,IX0,IY0,IXP,IYP
      LOGICAL NOTRAN
*     .. External Functions ..
      LOGICAL LSAME
      EXTERNAL LSAME
*     .. External Subroutines ..
      EXTERNAL XERBLA
*     .. Intrinsic Functions ..
      INTRINSIC MAX
*
*     Validate the arguments; the first failing test decides the
*     value handed to XERBLA.
*
      NOTRAN = LSAME(TRANS,'N')
      IERR = 0
      IF (.NOT.(NOTRAN .OR. LSAME(TRANS,'T') .OR.
     +    LSAME(TRANS,'C'))) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('DGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Logical vector lengths, and the start index of each vector.
*     With a negative increment the walk starts at the far end of
*     the array so that it steps back towards element 1.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      IX0 = 1
      IF (INCX.LE.0) IX0 = 1 - (NX-1)*INCX
      IY0 = 1
      IF (INCY.LE.0) IY0 = 1 - (NY-1)*INCY
*
*     Step 1:  y := beta*y.  BETA = 0 stores zeros without reading
*     Y; BETA = 1 leaves Y alone.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = ZERO
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = ZERO
                  IYP = IYP + INCY
              END DO
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = BETA*Y(IR)
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = BETA*Y(IYP)
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
*     Step 2:  add the matrix-vector product.  A is traversed column
*     by column in a single pass.
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y.  Columns whose x element is zero are
*        skipped.
*
          IXP = IX0
          IF (INCY.EQ.1) THEN
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      DO IR = 1,M
                          Y(IR) = Y(IR) + ACC*A(IR,JC)
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          ELSE
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      IYP = IY0
                      DO IR = 1,M
                          Y(IYP) = Y(IYP) + ACC*A(IR,JC)
                          IYP = IYP + INCY
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          END IF
      ELSE
*
*        y := alpha*A**T*x + y.  Each column of A yields one dot
*        product.
*
          IYP = IY0
          IF (INCX.EQ.1) THEN
              DO JC = 1,N
                  ACC = ZERO
                  DO IR = 1,M
                      ACC = ACC + A(IR,JC)*X(IR)
                  END DO
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          ELSE
              DO JC = 1,N
                  ACC = ZERO
                  IXP = IX0
                  DO IR = 1,M
                      ACC = ACC + A(IR,JC)*X(IXP)
                      IXP = IXP + INCX
                  END DO
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
*
      RETURN
*
*     End of DGEMV .
*
      END
|
||||
265
interface/netlib/sgemv.f
Normal file
265
interface/netlib/sgemv.f
Normal file
@@ -0,0 +1,265 @@
|
||||
*     SGEMV  --  netlib implementation of the Level 2 Blas
*     matrix-vector product for REAL data.
*
*     Operation (selected by TRANS; A is an m by n matrix, alpha and
*     beta are scalars, x and y are vectors):
*
*        TRANS = 'N' or 'n'   y := alpha*A*x    + beta*y
*        TRANS = 'T' or 't'   y := alpha*A**T*x + beta*y
*        TRANS = 'C' or 'c'   y := alpha*A**T*x + beta*y
*
*     Arguments (every argument except Y is unchanged on exit)
*
*     TRANS   CHARACTER*1.  Operation selector, see above.
*     M       INTEGER.  Number of rows of A; must be at least zero.
*     N       INTEGER.  Number of columns of A; must be at least zero.
*     ALPHA   REAL.  The scalar alpha.
*     A       REAL array of DIMENSION ( LDA, n ).  The leading
*             m by n part holds the matrix of coefficients.
*     LDA     INTEGER.  First dimension of A as declared by the
*             caller; must be at least max( 1, m ).
*     X       REAL array holding the vector x, of DIMENSION at
*             least ( 1 + ( n - 1 )*abs( INCX ) ) when
*             TRANS = 'N' or 'n' and at least
*             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
*     INCX    INTEGER.  Increment between elements of X; must not be
*             zero.
*     BETA    REAL.  The scalar beta.  When BETA is zero Y need not
*             be set on input.
*     Y       REAL array, of DIMENSION at least
*             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
*             and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
*             With BETA non-zero it must hold the vector y on entry;
*             on exit it is overwritten by the updated vector y.
*     INCY    INTEGER.  Increment between elements of Y; must not be
*             zero.
*
*     The vector and matrix arguments are not referenced when
*     N = 0 or M = 0.  An illegal argument is reported by calling
*     XERBLA with its position in the argument list (1, 2, 3, 6, 8 or
*     11); this routine does not modify Y in that case.
*
*     Written on 22-October-1986.
*        Jack Dongarra, Argonne National Lab.
*        Jeremy Du Croz, Nag Central Office.
*        Sven Hammarling, Nag Central Office.
*        Richard Hanson, Sandia National Labs.
*
      SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      REAL ALPHA,BETA
*     .. Array Arguments ..
      REAL A(LDA,*),X(*),Y(*)
*     .. Parameters ..
      REAL ZERO,ONE
      PARAMETER (ZERO=0.0E+0,ONE=1.0E+0)
*     .. Local Scalars ..
*     ACC      running sum / scaled x element
*     NX, NY   logical lengths of x and y
*     IX0, IY0 index of the first logical element of x and y
*     IXP, IYP current position in X and Y
      REAL ACC
      INTEGER IERR,IR,JC,NX,NY,IX0,IY0,IXP,IYP
      LOGICAL NOTRAN
*     .. External Functions ..
      LOGICAL LSAME
      EXTERNAL LSAME
*     .. External Subroutines ..
      EXTERNAL XERBLA
*     .. Intrinsic Functions ..
      INTRINSIC MAX
*
*     Validate the arguments; the first failing test decides the
*     value handed to XERBLA.
*
      NOTRAN = LSAME(TRANS,'N')
      IERR = 0
      IF (.NOT.(NOTRAN .OR. LSAME(TRANS,'T') .OR.
     +    LSAME(TRANS,'C'))) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('SGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Logical vector lengths, and the start index of each vector.
*     With a negative increment the walk starts at the far end of
*     the array so that it steps back towards element 1.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      IX0 = 1
      IF (INCX.LE.0) IX0 = 1 - (NX-1)*INCX
      IY0 = 1
      IF (INCY.LE.0) IY0 = 1 - (NY-1)*INCY
*
*     Step 1:  y := beta*y.  BETA = 0 stores zeros without reading
*     Y; BETA = 1 leaves Y alone.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = ZERO
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = ZERO
                  IYP = IYP + INCY
              END DO
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = BETA*Y(IR)
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = BETA*Y(IYP)
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
*     Step 2:  add the matrix-vector product.  A is traversed column
*     by column in a single pass.
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y.  Columns whose x element is zero are
*        skipped.
*
          IXP = IX0
          IF (INCY.EQ.1) THEN
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      DO IR = 1,M
                          Y(IR) = Y(IR) + ACC*A(IR,JC)
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          ELSE
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      IYP = IY0
                      DO IR = 1,M
                          Y(IYP) = Y(IYP) + ACC*A(IR,JC)
                          IYP = IYP + INCY
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          END IF
      ELSE
*
*        y := alpha*A**T*x + y.  Each column of A yields one dot
*        product.
*
          IYP = IY0
          IF (INCX.EQ.1) THEN
              DO JC = 1,N
                  ACC = ZERO
                  DO IR = 1,M
                      ACC = ACC + A(IR,JC)*X(IR)
                  END DO
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          ELSE
              DO JC = 1,N
                  ACC = ZERO
                  IXP = IX0
                  DO IR = 1,M
                      ACC = ACC + A(IR,JC)*X(IXP)
                      IXP = IXP + INCX
                  END DO
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
*
      RETURN
*
*     End of SGEMV .
*
      END
|
||||
285
interface/netlib/zgemv.f
Normal file
285
interface/netlib/zgemv.f
Normal file
@@ -0,0 +1,285 @@
|
||||
*     ZGEMV  --  netlib implementation of the Level 2 Blas
*     matrix-vector product for COMPLEX*16 data.
*
*     Operation (selected by TRANS; A is an m by n matrix, alpha and
*     beta are scalars, x and y are vectors):
*
*        TRANS = 'N' or 'n'   y := alpha*A*x    + beta*y
*        TRANS = 'T' or 't'   y := alpha*A**T*x + beta*y
*        TRANS = 'C' or 'c'   y := alpha*A**H*x + beta*y
*
*     Arguments (every argument except Y is unchanged on exit)
*
*     TRANS   CHARACTER*1.  Operation selector, see above.
*     M       INTEGER.  Number of rows of A; must be at least zero.
*     N       INTEGER.  Number of columns of A; must be at least zero.
*     ALPHA   COMPLEX*16.  The scalar alpha.
*     A       COMPLEX*16 array of DIMENSION ( LDA, n ).  The leading
*             m by n part holds the matrix of coefficients.
*     LDA     INTEGER.  First dimension of A as declared by the
*             caller; must be at least max( 1, m ).
*     X       COMPLEX*16 array holding the vector x, of DIMENSION at
*             least ( 1 + ( n - 1 )*abs( INCX ) ) when
*             TRANS = 'N' or 'n' and at least
*             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
*     INCX    INTEGER.  Increment between elements of X; must not be
*             zero.
*     BETA    COMPLEX*16.  The scalar beta.  When BETA is zero Y
*             need not be set on input.
*     Y       COMPLEX*16 array, of DIMENSION at least
*             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
*             and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
*             With BETA non-zero it must hold the vector y on entry;
*             on exit it is overwritten by the updated vector y.
*     INCY    INTEGER.  Increment between elements of Y; must not be
*             zero.
*
*     The vector and matrix arguments are not referenced when
*     N = 0 or M = 0.  An illegal argument is reported by calling
*     XERBLA with its position in the argument list (1, 2, 3, 6, 8 or
*     11); this routine does not modify Y in that case.
*
*     Written on 22-October-1986.
*        Jack Dongarra, Argonne National Lab.
*        Jeremy Du Croz, Nag Central Office.
*        Sven Hammarling, Nag Central Office.
*        Richard Hanson, Sandia National Labs.
*
      SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      DOUBLE COMPLEX ALPHA,BETA
*     .. Array Arguments ..
      DOUBLE COMPLEX A(LDA,*),X(*),Y(*)
*     .. Parameters ..
      DOUBLE COMPLEX ZERO,ONE
      PARAMETER (ZERO=(0.0D+0,0.0D+0),ONE=(1.0D+0,0.0D+0))
*     .. Local Scalars ..
*     ACC      running sum / scaled x element
*     NX, NY   logical lengths of x and y
*     IX0, IY0 index of the first logical element of x and y
*     IXP, IYP current position in X and Y
      DOUBLE COMPLEX ACC
      INTEGER IERR,IR,JC,NX,NY,IX0,IY0,IXP,IYP
      LOGICAL NOTRAN,CONJA
*     .. External Functions ..
      LOGICAL LSAME
      EXTERNAL LSAME
*     .. External Subroutines ..
      EXTERNAL XERBLA
*     .. Intrinsic Functions ..
      INTRINSIC DCONJG,MAX
*
*     Validate the arguments; the first failing test decides the
*     value handed to XERBLA.
*
      NOTRAN = LSAME(TRANS,'N')
      IERR = 0
      IF (.NOT.(NOTRAN .OR. LSAME(TRANS,'T') .OR.
     +    LSAME(TRANS,'C'))) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('ZGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for y := 0*A*x + 1*y.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Anything other than 'T' in the transposed branch conjugates A.
*
      CONJA = .NOT.LSAME(TRANS,'T')
*
*     Logical vector lengths, and the start index of each vector.
*     With a negative increment the walk starts at the far end of
*     the array so that it steps back towards element 1.
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      IX0 = 1
      IF (INCX.LE.0) IX0 = 1 - (NX-1)*INCX
      IY0 = 1
      IF (INCY.LE.0) IY0 = 1 - (NY-1)*INCY
*
*     Step 1:  y := beta*y.  BETA = 0 stores zeros without reading
*     Y; BETA = 1 leaves Y alone.
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = ZERO
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = ZERO
                  IYP = IYP + INCY
              END DO
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO IR = 1,NY
                  Y(IR) = BETA*Y(IR)
              END DO
          ELSE
              IYP = IY0
              DO IR = 1,NY
                  Y(IYP) = BETA*Y(IYP)
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
*     Step 2:  add the matrix-vector product.  A is traversed column
*     by column in a single pass.
*
      IF (NOTRAN) THEN
*
*        y := alpha*A*x + y.  Columns whose x element is zero are
*        skipped.
*
          IXP = IX0
          IF (INCY.EQ.1) THEN
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      DO IR = 1,M
                          Y(IR) = Y(IR) + ACC*A(IR,JC)
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          ELSE
              DO JC = 1,N
                  IF (X(IXP).NE.ZERO) THEN
                      ACC = ALPHA*X(IXP)
                      IYP = IY0
                      DO IR = 1,M
                          Y(IYP) = Y(IYP) + ACC*A(IR,JC)
                          IYP = IYP + INCY
                      END DO
                  END IF
                  IXP = IXP + INCX
              END DO
          END IF
      ELSE
*
*        y := alpha*A**T*x + y  or  y := alpha*A**H*x + y.  Each
*        column of A yields one dot product.
*
          IYP = IY0
          IF (INCX.EQ.1) THEN
              DO JC = 1,N
                  ACC = ZERO
                  IF (CONJA) THEN
                      DO IR = 1,M
                          ACC = ACC + DCONJG(A(IR,JC))*X(IR)
                      END DO
                  ELSE
                      DO IR = 1,M
                          ACC = ACC + A(IR,JC)*X(IR)
                      END DO
                  END IF
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          ELSE
              DO JC = 1,N
                  ACC = ZERO
                  IXP = IX0
                  IF (CONJA) THEN
                      DO IR = 1,M
                          ACC = ACC + DCONJG(A(IR,JC))*X(IXP)
                          IXP = IXP + INCX
                      END DO
                  ELSE
                      DO IR = 1,M
                          ACC = ACC + A(IR,JC)*X(IXP)
                          IXP = IXP + INCX
                      END DO
                  END IF
                  Y(IYP) = Y(IYP) + ALPHA*ACC
                  IYP = IYP + INCY
              END DO
          END IF
      END IF
*
      RETURN
*
*     End of ZGEMV .
*
      END
|
||||
@@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
@@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
@@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
@@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
|
||||
@@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
||||
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
|
||||
# LOONGSON3B branch: the real single/double and complex single/double TRMM
# kernels are compiled from the dedicated $(STRMMKERNEL), $(DTRMMKERNEL),
# $(CTRMMKERNEL) and $(ZTRMMKERNEL) sources. The extended-precision real (q)
# rules in this branch still compile $(QGEMMKERNEL).
# Every rule passes -DTRMMKERNEL. The two-letter suffix of the target name
# maps onto the flags as follows:
#   first letter   L -> -DLEFT          R -> -ULEFT
#   second letter  N -> -UTRANSA        T -> -DTRANSA
#   complex only   R -> -UTRANSA -DCONJ C -> -DTRANSA -DCONJ
ifeq ($(TARGET), LOONGSON3B)
|
||||
# Single-precision real kernels (-UDOUBLE -UCOMPLEX), built from $(STRMMKERNEL).
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
|
||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
|
||||
# Double-precision real kernels (-DDOUBLE -UCOMPLEX), built from $(DTRMMKERNEL).
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
|
||||
# Extended-precision real kernels (-DXDOUBLE -UCOMPLEX). Unlike the s/d/c/z
# groups in this branch, these are compiled from the GEMM source $(QGEMMKERNEL).
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
|
||||
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
|
||||
# Single-precision complex kernels (-UDOUBLE -DCOMPLEX), built from
# $(CTRMMKERNEL). N/T variants pass -UCONJ -DNN; the conjugated variants pass
# -DCONJ with -DCN (left side: LR, LC) or -DNC (right side: RR, RC).
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
# Double-precision complex kernels (-DDOUBLE -DCOMPLEX), built from
# $(ZTRMMKERNEL); same suffix-to-flag scheme as the ctrmm rules.
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
@@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
157
kernel/generic/gemmkernel_2x2.c
Normal file
157
kernel/generic/gemmkernel_2x2.c
Normal file
@@ -0,0 +1,157 @@
|
||||
#include "common.h"
|
||||
/*
 * Generic 2x2 register-blocked GEMM micro-kernel.
 *
 * Adds alpha * (A panel * B panel) to C, where C is addressed as
 * C[i + j*ldc] for i in [0, bm) and j in [0, bn).
 *
 *   bm, bn, bk - extents of the tile: i range, j range, and the length of
 *                the inner-product (k) dimension.
 *   alpha      - scalar applied to every accumulated dot product before it
 *                is added to C.
 *   ba         - A panel, consumed front to back for every j block: two
 *                values per k step for each pair of i indices, followed by
 *                one value per k step for the trailing odd i (bm odd).
 *   bb         - B panel: two values per k step for each pair of j indices,
 *                followed by one value per k step for the trailing odd j.
 *   C, ldc     - output buffer and the element stride between j and j+1.
 *   offset     - only part of the signature when TRMMKERNEL is defined;
 *                this routine never reads it.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
          FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc
#ifdef TRMMKERNEL
          , BLASLONG offset
#endif
          )
{
    BLASLONG col_blk, row_blk, step;
    FLOAT *c_j0, *c_j1;   /* write cursors for C[., j] and C[., j+1] */
    FLOAT *a, *b;         /* read cursors into the A and B panels     */
    FLOAT acc00, acc10, acc01, acc11;

    /* ---- pairs of j indices ------------------------------------------ */
    for (col_blk = bn / 2; col_blk > 0; col_blk--) {
        c_j0 = C;
        c_j1 = C + ldc;
        a = ba;

        /* 2x2 tiles */
        for (row_blk = bm / 2; row_blk > 0; row_blk--) {
            b = bb;
            acc00 = acc10 = acc01 = acc11 = 0;

            /* main part of the k loop, four k steps per iteration */
            for (step = bk / 4; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                acc01 += a[0] * b[1];
                acc11 += a[1] * b[1];

                acc00 += a[2] * b[2];
                acc10 += a[3] * b[2];
                acc01 += a[2] * b[3];
                acc11 += a[3] * b[3];

                acc00 += a[4] * b[4];
                acc10 += a[5] * b[4];
                acc01 += a[4] * b[5];
                acc11 += a[5] * b[5];

                acc00 += a[6] * b[6];
                acc10 += a[7] * b[6];
                acc01 += a[6] * b[7];
                acc11 += a[7] * b[7];

                a += 8;
                b += 8;
            }

            /* remaining bk mod 4 steps, one at a time */
            for (step = bk & 3; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                acc01 += a[0] * b[1];
                acc11 += a[1] * b[1];
                a += 2;
                b += 2;
            }

            /* scale first, then accumulate into C */
            acc00 *= alpha;
            c_j0[0] += acc00;
            acc10 *= alpha;
            c_j0[1] += acc10;
            acc01 *= alpha;
            c_j1[0] += acc01;
            acc11 *= alpha;
            c_j1[1] += acc11;

            c_j0 += 2;
            c_j1 += 2;
        }

        /* trailing 1x2 tile when bm is odd */
        if (bm & 1) {
            b = bb;
            acc00 = acc01 = 0;

            for (step = bk; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc01 += a[0] * b[1];
                a += 1;
                b += 2;
            }

            acc00 *= alpha;
            c_j0[0] += acc00;
            acc01 *= alpha;
            c_j1[0] += acc01;

            c_j0 += 1;
            c_j1 += 1;
        }

        /* next pair of j indices: skip 2*bk B values and two C strides */
        bb += bk * 2;
        C += ldc * 2;
    }

    /* ---- trailing single j index when bn is odd ----------------------- */
    if (bn & 1) {
        c_j0 = C;
        a = ba;

        /* 2x1 tiles */
        for (row_blk = bm / 2; row_blk > 0; row_blk--) {
            b = bb;
            acc00 = acc10 = 0;

            for (step = bk; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                a += 2;
                b += 1;
            }

            acc00 *= alpha;
            c_j0[0] += acc00;
            acc10 *= alpha;
            c_j0[1] += acc10;

            c_j0 += 2;
        }

        /* final 1x1 tile when bm is odd as well */
        if (bm & 1) {
            b = bb;
            acc00 = 0;

            for (step = bk; step > 0; step--) {
                acc00 += a[0] * b[0];
                a += 1;
                b += 1;
            }

            acc00 *= alpha;
            c_j0[0] += acc00;

            c_j0 += 1;
        }

        bb += bk;
        C += ldc;
    }

    return 0;
}
|
||||
280
kernel/generic/trmmkernel_2x2.c
Normal file
280
kernel/generic/trmmkernel_2x2.c
Normal file
@@ -0,0 +1,280 @@
|
||||
#include "common.h"
|
||||
/*
 * Compile-time helpers local to this kernel (removed again after the
 * function).
 *
 * TRMM2X2_LEADING_K is 1 when LEFT and TRANSA are either both defined or
 * both undefined. In that configuration every tile accumulates over the
 * first `off + width` k steps and afterwards skips the rest of its packed
 * rows; in the opposite configuration every tile first skips `off` k steps
 * and accumulates over the remaining `bk - off`.
 *
 * TRMM2X2_WIDTH(mr, nr) picks the tile extent that enters the k-range
 * computation: mr (extent along i) when LEFT is defined, nr (extent along
 * j) otherwise.
 */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define TRMM2X2_LEADING_K 1
#else
#define TRMM2X2_LEADING_K 0
#endif

#ifdef LEFT
#define TRMM2X2_WIDTH(mr, nr) (mr)
#else
#define TRMM2X2_WIDTH(mr, nr) (nr)
#endif

/*
 * Generic 2x2 register-blocked TRMM micro-kernel.
 *
 * For every tile of C (addressed as C[i + j*ldc], i in [0, bm),
 * j in [0, bn)) it forms the dot product of an A-panel row group and a
 * B-panel column group over a restricted k range, multiplies by alpha and
 * OVERWRITES the C entries (nothing is read from C).
 *
 *   bm, bn, bk - i extent, j extent and full length of the k dimension.
 *   alpha      - scalar applied to every accumulated dot product.
 *   ba         - A panel: 2 values per k step for each pair of i indices,
 *                1 value per k step for the trailing odd i.
 *   bb         - B panel: 2 values per k step for each pair of j indices,
 *                1 value per k step for the trailing odd j.
 *   C, ldc     - output buffer and the element stride between j and j+1.
 *   offset     - only present when TRMMKERNEL is defined. Seeds the running
 *                position `off`: with LEFT it is reloaded for every j block
 *                and advanced by the tile height after each tile; without
 *                LEFT it starts at -offset and is advanced by the tile
 *                width after each j block.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
          FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc
#ifdef TRMMKERNEL
          , BLASLONG offset
#endif
          )
{
    BLASLONG col_blk, row_blk, step;
    BLASLONG off, klen;
    FLOAT *c_j0, *c_j1;   /* write cursors for C[., j] and C[., j+1] */
    FLOAT *a, *b;         /* read cursors into the A and B panels     */
    FLOAT acc00, acc10, acc01, acc11;

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---- pairs of j indices ------------------------------------------ */
    for (col_blk = bn / 2; col_blk > 0; col_blk--) {
        c_j0 = C;
        c_j1 = C + ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        a = ba;

        /* 2x2 tiles */
        for (row_blk = bm / 2; row_blk > 0; row_blk--) {
#if TRMM2X2_LEADING_K
            b = bb;
            klen = off + TRMM2X2_WIDTH(2, 2);
#else
            a += off * 2;
            b = bb + off * 2;
            klen = bk - off;
#endif
            acc00 = acc10 = acc01 = acc11 = 0;

            /* main part of the k range, four k steps per iteration */
            for (step = klen / 4; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                acc01 += a[0] * b[1];
                acc11 += a[1] * b[1];

                acc00 += a[2] * b[2];
                acc10 += a[3] * b[2];
                acc01 += a[2] * b[3];
                acc11 += a[3] * b[3];

                acc00 += a[4] * b[4];
                acc10 += a[5] * b[4];
                acc01 += a[4] * b[5];
                acc11 += a[5] * b[5];

                acc00 += a[6] * b[6];
                acc10 += a[7] * b[6];
                acc01 += a[6] * b[7];
                acc11 += a[7] * b[7];

                a += 8;
                b += 8;
            }

            /* remaining klen mod 4 steps */
            for (step = klen & 3; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                acc01 += a[0] * b[1];
                acc11 += a[1] * b[1];
                a += 2;
                b += 2;
            }

            /* TRMM stores the scaled result instead of accumulating */
            c_j0[0] = acc00 * alpha;
            c_j0[1] = acc10 * alpha;
            c_j1[0] = acc01 * alpha;
            c_j1[1] = acc11 * alpha;

#if TRMM2X2_LEADING_K
            /* step both cursors over the k steps that were not visited */
            klen = bk - off - TRMM2X2_WIDTH(2, 2);
            a += klen * 2;
            b += klen * 2;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_j0 += 2;
            c_j1 += 2;
        }

        /* trailing 1x2 tile when bm is odd */
        if (bm & 1) {
#if TRMM2X2_LEADING_K
            b = bb;
            klen = off + TRMM2X2_WIDTH(1, 2);
#else
            a += off;
            b = bb + off * 2;
            klen = bk - off;
#endif
            acc00 = acc01 = 0;

            for (step = klen; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc01 += a[0] * b[1];
                a += 1;
                b += 2;
            }

            c_j0[0] = acc00 * alpha;
            c_j1[0] = acc01 * alpha;

#if TRMM2X2_LEADING_K
            klen = bk - off - TRMM2X2_WIDTH(1, 2);
            a += klen;
            b += klen * 2;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_j0 += 1;
            c_j1 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        /* next pair of j indices: skip 2*bk B values and two C strides */
        bb += bk * 2;
        C += ldc * 2;
    }

    /* ---- trailing single j index when bn is odd ----------------------- */
    if (bn & 1) {
        c_j0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        a = ba;

        /* 2x1 tiles */
        for (row_blk = bm / 2; row_blk > 0; row_blk--) {
#if TRMM2X2_LEADING_K
            b = bb;
            klen = off + TRMM2X2_WIDTH(2, 1);
#else
            a += off * 2;
            b = bb + off;
            klen = bk - off;
#endif
            acc00 = acc10 = 0;

            for (step = klen; step > 0; step--) {
                acc00 += a[0] * b[0];
                acc10 += a[1] * b[0];
                a += 2;
                b += 1;
            }

            c_j0[0] = acc00 * alpha;
            c_j0[1] = acc10 * alpha;

#if TRMM2X2_LEADING_K
            klen = bk - off - TRMM2X2_WIDTH(2, 1);
            a += klen * 2;
            b += klen;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_j0 += 2;
        }

        /* final 1x1 tile when bm is odd as well */
        if (bm & 1) {
#if TRMM2X2_LEADING_K
            b = bb;
            klen = off + TRMM2X2_WIDTH(1, 1);
#else
            a += off;
            b = bb + off;
            klen = bk - off;
#endif
            acc00 = 0;

            for (step = klen; step > 0; step--) {
                acc00 += a[0] * b[0];
                a += 1;
                b += 1;
            }

            c_j0[0] = acc00 * alpha;

#if TRMM2X2_LEADING_K
            klen = bk - off - TRMM2X2_WIDTH(1, 1);
            a += klen;
            b += klen;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_j0 += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 1;
#endif
        bb += bk;
        C += ldc;
    }

    return 0;
}

#undef TRMM2X2_WIDTH
#undef TRMM2X2_LEADING_K
|
||||
838
kernel/generic/zgemmkernel_2x2.c
Normal file
838
kernel/generic/zgemmkernel_2x2.c
Normal file
@@ -0,0 +1,838 @@
|
||||
#include "common.h"
|
||||
/********************************
|
||||
ADD1 a*c
|
||||
ADD2 b*c
|
||||
ADD3 a*d
|
||||
ADD4 b*d
|
||||
*********************************/
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
|
||||
for (j=0; j<bn/2; j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+2*ldc;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
res4 = 0;
|
||||
res5 = 0;
|
||||
res6 = 0;
|
||||
res7 = 0;
|
||||
for (k=0; k<bk/4; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
ptrba = ptrba+16;
|
||||
ptrbb = ptrbb+16;
|
||||
}
|
||||
for (k=0; k<(bk&3); k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C0[2] = C0[2]+load2;
|
||||
load3 = res3*alphar;
|
||||
C0[3] = C0[3]+load3;
|
||||
load2 = res3*alphai;
|
||||
C0[2] = C0[2]-load2;
|
||||
load3 = res2*alphai;
|
||||
C0[3] = C0[3]+load3;
|
||||
load4 = res4*alphar;
|
||||
C1[0] = C1[0]+load4;
|
||||
load5 = res5*alphar;
|
||||
C1[1] = C1[1]+load5;
|
||||
load4 = res5*alphai;
|
||||
C1[0] = C1[0]-load4;
|
||||
load5 = res4*alphai;
|
||||
C1[1] = C1[1]+load5;
|
||||
load6 = res6*alphar;
|
||||
C1[2] = C1[2]+load6;
|
||||
load7 = res7*alphar;
|
||||
C1[3] = C1[3]+load7;
|
||||
load6 = res7*alphai;
|
||||
C1[2] = C1[2]-load6;
|
||||
load7 = res6*alphai;
|
||||
C1[3] = C1[3]+load7;
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C1[0] = C1[0]+load2;
|
||||
load3 = res3*alphar;
|
||||
C1[1] = C1[1]+load3;
|
||||
load2 = res3*alphai;
|
||||
C1[0] = C1[0]-load2;
|
||||
load3 = res2*alphai;
|
||||
C1[1] = C1[1]+load3;
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
for (j=0; j<(bn&1); j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C0[2] = C0[2]+load2;
|
||||
load3 = res3*alphar;
|
||||
C0[3] = C0[3]+load3;
|
||||
load2 = res3*alphai;
|
||||
C0[2] = C0[2]-load2;
|
||||
load3 = res2*alphai;
|
||||
C0[3] = C0[3]+load3;
|
||||
C0 = C0+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
C0 = C0+2;
|
||||
}
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
923
kernel/generic/ztrmmkernel_2x2.c
Normal file
923
kernel/generic/ztrmmkernel_2x2.c
Normal file
@@ -0,0 +1,923 @@
|
||||
#include "common.h"
|
||||
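/********************************
|
||||
 ADD1 a*c   (a = real part of the A element, c = real part of the B element)
|
||||
 ADD2 b*c   (b = imaginary part of the A element)
|
||||
 ADD3 a*d   (d = imaginary part of the B element)
|
||||
 ADD4 b*d   (sign of each term depends on the conjugation variant: NN/NR/RN/RR)
|
||||
*********************************/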
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
|
||||
FLOAT* C,BLASLONG ldc, BLASLONG offset)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
|
||||
BLASLONG off, temp;
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#endif
|
||||
for (j=0; j<bn/2; j+=1)
|
||||
{
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
C0 = C;
|
||||
C1 = C0+2*ldc;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
res4 = 0;
|
||||
res5 = 0;
|
||||
res6 = 0;
|
||||
res7 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 2;
|
||||
#endif
|
||||
for (k=0; k<temp/4; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
ptrba = ptrba+16;
|
||||
ptrbb = ptrbb+16;
|
||||
}
|
||||
for (k=0; k<(temp&3); k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C0[2] = load2;
|
||||
C0[3] = load3;
|
||||
|
||||
load4 = res4*alphar-res5*alphai;
|
||||
load5 = res5*alphar+res4*alphai;
|
||||
C1[0] = load4;
|
||||
C1[1] = load5;
|
||||
|
||||
load6 = res6*alphar-res7*alphai;
|
||||
load7 = res7*alphar+res6*alphai;
|
||||
C1[2] = load6;
|
||||
C1[3] = load7;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1;
|
||||
#else
|
||||
temp = off+2;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C1[0] = load2;
|
||||
C1[1] = load3;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
for (j=0; j<(bn&1); j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C0[2] = load2;
|
||||
C0[3] = load3;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
C0 = C0+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 1;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
}
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LN
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LT
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RN
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RT
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LN
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LT
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RN
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RT
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
|
||||
@@ -1,18 +1,48 @@
|
||||
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_loongson3a.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = gemm_kernel_loongson3a.S
|
||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
@@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
64
kernel/mips64/KERNEL.LOONGSON3B
Normal file
64
kernel/mips64/KERNEL.LOONGSON3B
Normal file
@@ -0,0 +1,64 @@
|
||||
# AXPY kernels: single and double precision each point at a Loongson-3A assembly source.
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
# GEMV kernels (N = non-transposed, T = transposed): S and D share the
# gemv_*_loongson3a.c sources, C and Z share the zgemv_*_loongson3a.c sources.
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
# TRMM kernels: generic 2x2 C sources (real precisions share one file, complex precisions another).
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
# GEMM kernels: generic 2x2 C sources, each paired with the generic
# *_ncopy_2 / *_tcopy_2 copy routines and the object names they build to.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
# TRSM kernels: all four precisions (S, D, C, Z) use the same generic C
# sources, one per variant suffix (LN, LT, RN, RT).
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
1468
kernel/mips64/cgemm_kernel_loongson3a_2x2.S
Normal file
1468
kernel/mips64/cgemm_kernel_loongson3a_2x2.S
Normal file
File diff suppressed because it is too large
Load Diff
4026
kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
Normal file
4026
kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S
Normal file
File diff suppressed because it is too large
Load Diff
1468
kernel/mips64/cgemm_kernel_loongson3b_2x2.S
Normal file
1468
kernel/mips64/cgemm_kernel_loongson3b_2x2.S
Normal file
File diff suppressed because it is too large
Load Diff
101
kernel/mips64/gemv_n_loongson3a.c
Normal file
101
kernel/mips64/gemv_n_loongson3a.c
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "common.h"
|
||||
|
||||
//This is auto-tuning code for the Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
// Emits an inline-asm "ld" of the memory operand x into register $0.
// NOTE(review): presumably a software prefetch whose loaded value is discarded -- confirm against the Loongson-3A manual.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
// Branch hints: tell the compiler the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
// Single-element update steps, expanded inside CNAME. They read and modify the
// caller's locals: i (row index, incremented by every step), j (column),
// k (index into X) and, for the norm_* forms, h (index into Y, advanced by INCY).
// spec_*   : Y is indexed directly by i.
// norm_*   : Y is indexed by h.
// *_alpha1 : the ALPHA factor is left out of the product.
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
|
||||
// Accumulates ALPHA * A * X into Y, one column of A at a time.
//   M, N       : entries per column / number of columns visited.
//   A, LDA     : column j starts at A[LDA * j].
//   X, INCX    : one X entry per column, stepping by INCX.
//   Y, INCY    : one Y entry per row, stepping by INCY.
//   UNUSED, BUFFER : not referenced.
// Both strides are used exactly as passed in; no sign normalisation is done here.
// Returns 0 in all cases, immediately so when ALPHA is zero.
// The loop counters must keep the names i, j, k and h because the
// spec_loop* / norm_loop* macros refer to them.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    const BLASLONG x_start = 0, y_start = 0;

    if (ALPHA == 0)
        return 0;

    const BLASLONG lookahead = 30;               // prefetch distance, in elements
    const BLASLONG unroll = 4;                   // update steps per unrolled pass
    const BLASLONG unrolled_end = M - M % unroll;
    BLASLONG j, k;

    if (ALPHA == 1 && INCY == 1) {
        // Unit alpha, contiguous Y.
        for (j = 0, k = x_start; likely(j < N); j++, k += INCX) {
            BLASLONG i = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(Y[i + lookahead]);
                spec_loop_alpha1;
                spec_loop_alpha1;
                spec_loop_alpha1;
                spec_loop_alpha1;
            }
            // Leftover rows that do not fill a whole unrolled pass.
            while (likely(i < M)) {
                spec_loop_alpha1;
            }
        }
    } else if (ALPHA == 1) {
        // Unit alpha, strided Y.
        for (j = 0, k = x_start; likely(j < N); j++, k += INCX) {
            BLASLONG i = 0, h = y_start;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(Y[h + lookahead]);
                norm_loop_alpha1;
                norm_loop_alpha1;
                norm_loop_alpha1;
                norm_loop_alpha1;
            }
            while (likely(i < M)) {
                norm_loop_alpha1;
            }
        }
    } else if (INCY == 1) {
        // General alpha, contiguous Y.
        for (j = 0, k = x_start; likely(j < N); j++, k += INCX) {
            BLASLONG i = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(Y[i + lookahead]);
                spec_loop;
                spec_loop;
                spec_loop;
                spec_loop;
            }
            while (likely(i < M)) {
                spec_loop;
            }
        }
    } else {
        // General alpha, strided Y.
        for (j = 0, k = x_start; likely(j < N); j++, k += INCX) {
            BLASLONG i = 0, h = y_start;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(Y[h + lookahead]);
                norm_loop;
                norm_loop;
                norm_loop;
                norm_loop;
            }
            while (likely(i < M)) {
                norm_loop;
            }
        }
    }

    return 0;
}
|
||||
93
kernel/mips64/gemv_t_loongson3a.c
Normal file
93
kernel/mips64/gemv_t_loongson3a.c
Normal file
@@ -0,0 +1,93 @@
|
||||
#include "common.h"
|
||||
|
||||
//This is auto-tuning code for the Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
// Emits an inline-asm "ld" of the memory operand x into register $0.
// NOTE(review): presumably a software prefetch whose loaded value is discarded -- confirm against the Loongson-3A manual.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
// Branch hints: tell the compiler the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
// Single-element accumulation steps, expanded inside CNAME. Each adds one
// A * X product into Y[k] and increments i. They use the caller's locals
// i (position within the column), j (column), k (index into Y) and, for the
// norm_* forms, h (index into X, advanced by INCX).
// spec_*   : X is indexed directly by i.
// norm_*   : X is indexed by h.
// *_alpha1 : the ALPHA factor is left out of the product.
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
|
||||
// For each of the N columns of A, accumulates ALPHA * (column j of A) . X
// into one entry of Y.
//   M, N       : entries per column / number of columns visited.
//   A, LDA     : column j starts at A[LDA * j].
//   X, INCX    : one X entry per row, stepping by INCX.
//   Y, INCY    : one Y entry per column, stepping by INCY.
//   UNUSED, BUFFER : not referenced.
// Both strides are used exactly as passed in; no sign normalisation is done here.
// Returns 0 in all cases, immediately so when ALPHA is zero.
// The loop counters must keep the names i, j, k and h because the
// spec_loop* / norm_loop* macros refer to them.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

    if (ALPHA == 0)
        return 0;

    const BLASLONG lookahead = 30;               // prefetch distance, in elements
    const BLASLONG unroll = 3;                   // accumulation steps per unrolled pass
    const BLASLONG unrolled_end = M - M % unroll;
    BLASLONG j, k;

    if (ALPHA == 1 && INCX == 1) {
        // Unit alpha, contiguous X.
        for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
            BLASLONG i = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(X[i + lookahead]);
                spec_loop_alpha1;
                spec_loop_alpha1;
                spec_loop_alpha1;
            }
            // Leftover rows that do not fill a whole unrolled pass.
            while (likely(i < M)) {
                spec_loop_alpha1;
            }
        }
    } else if (ALPHA == 1) {
        // Unit alpha, strided X.
        for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
            BLASLONG i = 0, h = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(X[h + lookahead]);
                norm_loop_alpha1;
                norm_loop_alpha1;
                norm_loop_alpha1;
            }
            while (likely(i < M)) {
                norm_loop_alpha1;
            }
        }
    } else if (INCX == 1) {
        // General alpha, contiguous X.
        for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
            BLASLONG i = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(X[i + lookahead]);
                spec_loop;
                spec_loop;
                spec_loop;
            }
            while (likely(i < M)) {
                spec_loop;
            }
        }
    } else {
        // General alpha, strided X.
        for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
            BLASLONG i = 0, h = 0;
            while (likely(i < unrolled_end)) {
                prefetch(A[LDA * j + i + lookahead]);
                prefetch(X[h + lookahead]);
                norm_loop;
                norm_loop;
                norm_loop;
            }
            while (likely(i < M)) {
                norm_loop;
            }
        }
    }

    return 0;
}
|
||||
7797
kernel/mips64/sgemm_kernel_8x4_ps.S
Normal file
7797
kernel/mips64/sgemm_kernel_8x4_ps.S
Normal file
File diff suppressed because it is too large
Load Diff
2579
kernel/mips64/sgemm_kernel_loongson3a_4x4.S
Normal file
2579
kernel/mips64/sgemm_kernel_loongson3a_4x4.S
Normal file
File diff suppressed because it is too large
Load Diff
2579
kernel/mips64/sgemm_kernel_loongson3b_4x4.S
Normal file
2579
kernel/mips64/sgemm_kernel_loongson3b_4x4.S
Normal file
File diff suppressed because it is too large
Load Diff
1355
kernel/mips64/zgemm_kernel_loongson3a_2x2.S
Normal file
1355
kernel/mips64/zgemm_kernel_loongson3a_2x2.S
Normal file
File diff suppressed because it is too large
Load Diff
1468
kernel/mips64/zgemm_kernel_loongson3b_2x2.S
Normal file
1468
kernel/mips64/zgemm_kernel_loongson3b_2x2.S
Normal file
File diff suppressed because it is too large
Load Diff
139
kernel/mips64/zgemv_n_loongson3a.c
Normal file
139
kernel/mips64/zgemv_n_loongson3a.c
Normal file
@@ -0,0 +1,139 @@
|
||||
#include "common.h"
|
||||
|
||||
//typedef int BLASLONG;
|
||||
//typedef double FLOAT;
|
||||
|
||||
// Emits an inline-asm "ld" of the memory operand x into register $0.
// NOTE(review): presumably a software prefetch whose loaded value is discarded -- confirm against the Loongson-3A manual.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
// Branch hints: tell the compiler the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
// Compile-time choice of the step macros used by CNAME. Exactly one of the
// four #if blocks below is active, keyed on the CONJ / XCONJ defines:
//   neither -> _0, CONJ only -> _1, XCONJ only -> _2, both -> _3.
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
// Step macros for one complex element, with values held as consecutive
// (real, imaginary) scalar pairs. They use the caller's locals ii (scalar
// offset within the column of A), jj (scalar offset of the column), k (scalar
// offset into X) and, for norm_*, iii (scalar offset into Y).
// The sign pattern of each suffix works out to:
//   _0: A * X    _1: conj(A) * X    _2: A * conj(X)    _3: conj(A) * conj(X)
//
// spec_loop_alpha1_*: add the product straight into Y[ii], Y[ii + 1]; ii advances by 2.
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
// spec_loop_*: form the product in rTmp/iTmp, multiply it by (rALPHA, iALPHA)
// as a complex number and add the result into Y[ii], Y[ii + 1]; ii advances by 2.
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
// norm_loop_alpha1_*: as spec_loop_alpha1_*, but Y is addressed through iii,
// which advances by INCY * 2 scalars per step.
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
// norm_loop_*: as spec_loop_*, but Y is addressed through iii (step INCY * 2).
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
// Complex matrix-vector update: accumulates alpha * (A times X) into Y, where
// alpha = rALPHA + i * iALPHA and any conjugation of A or X is the one built
// into the spec_loop* / norm_loop* macros by the CONJ / XCONJ defines.
//   M, N       : complex entries per column / number of columns visited.
//   A, LDA     : column j starts at scalar offset jj = 2 * LDA * j.
//   X, INCX    : one complex X entry per column; k advances by 2 * INCX scalars.
//   Y, INCY    : one complex Y entry per row; contiguous path when INCY == 1,
//                otherwise the step macros advance iii by 2 * INCY scalars.
//   UNUSED, BUFFER : not referenced.
// Returns 0 in all cases.
// The counters must keep the names ii, iii, jj, k, rTmp and iTmp because the
// step macros refer to them.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

    // Quick return only when alpha is exactly 0 + 0i. The earlier test
    // (!rALPHA && iALPHA) returned for purely imaginary alpha, which dropped
    // the whole update, and did not take the quick return for alpha == 0.
    if(!rALPHA && !iALPHA)
        return 0;

    BLASLONG fahead = 60;                  // prefetch distance, in scalars
    BLASLONG spec_unroll = 2;              // complex elements per unrolled pass
    BLASLONG tMQ = M - M % spec_unroll;    // rows covered by whole unrolled passes
    BLASLONG j = 0, k = 0, jj = 0;

    if(rALPHA == 1 && iALPHA == 0) {
        // alpha == 1: use the step macros that skip the alpha multiply.
        if(INCY == 1) {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                // Leftover row when M is odd.
                for(; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for(; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        // General alpha: the step macros stage each product in rTmp/iTmp.
        FLOAT rTmp, iTmp;
        if(INCY == 1) {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for(; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for(; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}
|
||||
125
kernel/mips64/zgemv_t_loongson3a.c
Normal file
125
kernel/mips64/zgemv_t_loongson3a.c
Normal file
@@ -0,0 +1,125 @@
|
||||
#include "common.h"
|
||||
|
||||
// Emits an inline-asm "ld" of the memory operand x into register $0.
// NOTE(review): presumably a software prefetch whose loaded value is discarded -- confirm against the Loongson-3A manual.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
// Branch hints: tell the compiler the condition is expected to be true (likely) or false (unlikely).
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
// Compile-time choice of the step macros used by CNAME. Exactly one of the
// four #if blocks below is active, keyed on the CONJ / XCONJ defines:
//   neither -> _0, CONJ only -> _1, XCONJ only -> _2, both -> _3.
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
|
||||
// Step macros for one complex element, with values held as consecutive
// (real, imaginary) scalar pairs. Each accumulates one A-times-X product into
// Y[k], Y[k + 1]. They use the caller's locals ii (scalar offset within the
// column of A), jj (scalar offset of the column), k (scalar offset into Y)
// and, for norm_*, iii (scalar offset into X).
// The sign pattern of each suffix works out to:
//   _0: A * X    _1: conj(A) * X    _2: A * conj(X)    _3: conj(A) * conj(X)
//
// spec_loop_alpha1_*: X is indexed by ii; no alpha multiply; ii advances by 2.
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
|
||||
// spec_loop_*: X is indexed by ii; the product is staged in rTmp/iTmp and
// multiplied by (rALPHA, iALPHA) as a complex number before being added.
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
// norm_loop_alpha1_*: as spec_loop_alpha1_*, but X is addressed through iii,
// which advances by INCX * 2 scalars per step.
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
// norm_loop_*: as spec_loop_*, but X is addressed through iii (step INCX * 2).
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
/*
 * Complex GEMV kernel (accumulating form).
 *
 * For each of the N outputs j, walks M complex elements of A starting at
 * scalar offset j * LDA * 2 and M complex elements of X, and adds
 * alpha * sum_i (a_i combined with x_i) into the complex element of Y at
 * scalar offset j * INCY * 2.  Complex values are stored as interleaved
 * (real, imaginary) scalars.  How a_i and x_i are combined (plain or
 * conjugated) is decided by the spec_loop / norm_loop macros.
 * NOTE(review): those aliases are defined outside this function; presumably
 * they map onto the _0.._3 variants - confirm.
 *
 * Parameters:
 *   M              number of complex elements accumulated per output.
 *   N              number of outputs (elements of Y updated).
 *   UNUSED         not referenced.
 *   rALPHA, iALPHA real and imaginary parts of the scaling factor alpha.
 *   A, LDA         matrix data and its leading dimension in complex elements.
 *   X, INCX        input vector and its stride in complex elements.
 *   Y, INCY        output vector (updated in place) and its stride in
 *                  complex elements.
 *   BUFFER         not referenced.
 *
 * Returns 0 always.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Nothing to add when alpha == 0, i.e. BOTH parts are zero.  The previous
	 * test (!rALPHA && iALPHA) instead bailed out for every purely imaginary,
	 * non-zero alpha and silently left Y without its update.
	 */
	if(!rALPHA && !iALPHA)
		return 0;

	BLASLONG fahead = 30;                /* prefetch distance, in scalars */
	BLASLONG spec_unroll = 2;            /* complex elements handled per unrolled iteration */
	BLASLONG tMQ = M - M % spec_unroll;  /* largest multiple of spec_unroll that is <= M */
	BLASLONG j = 0, k = 0, jj = 0;       /* output index, scalar offset into Y, scalar offset into A */

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: accumulate the products directly, no scaling needed. */
		if(INCX == 1) {
			/* Contiguous X: one index (ii) serves both A and X. */
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;

				/* Unrolled main loop; each macro invocation advances ii itself. */
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}

				/* Remainder when M is not a multiple of spec_unroll. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* Strided X: iii tracks the position in X separately. */
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}

				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: the macros form each product in rTmp/iTmp, then scale. */
		FLOAT rTmp, iTmp;

		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}

				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;

				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}

				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}

	return 0;
}
|
||||
@@ -239,6 +239,22 @@ ifndef ZSWAPKERNEL
|
||||
ZSWAPKERNEL = zswap_sse2.S
|
||||
endif
|
||||
|
||||
# Fallback GEMV kernel sources. Each variable is assigned only when it does not
# already have a non-empty value (ifndef), so an earlier assignment takes
# precedence over the SSE2 file named here.
# NOTE(review): these blocks sit inside an outer conditional whose opening line
# is not visible here - confirm which targets it selects.
# NOTE(review): the D/Z prefix and N/T infix presumably mean double real /
# double complex and non-transposed / transposed - confirm.

# DGEMVNKERNEL falls back to gemv_n_sse2.S.
ifndef DGEMVNKERNEL
|
||||
DGEMVNKERNEL = gemv_n_sse2.S
|
||||
endif
|
||||
|
||||
# DGEMVTKERNEL falls back to gemv_t_sse2.S.
ifndef DGEMVTKERNEL
|
||||
DGEMVTKERNEL = gemv_t_sse2.S
|
||||
endif
|
||||
|
||||
# ZGEMVNKERNEL falls back to zgemv_n_sse2.S.
ifndef ZGEMVNKERNEL
|
||||
ZGEMVNKERNEL = zgemv_n_sse2.S
|
||||
endif
|
||||
|
||||
# ZGEMVTKERNEL falls back to zgemv_t_sse2.S.
ifndef ZGEMVTKERNEL
|
||||
ZGEMVTKERNEL = zgemv_t_sse2.S
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
|
||||
79
param.h
79
param.h
@@ -1480,31 +1480,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 32
|
||||
#define DGEMM_DEFAULT_P 32
|
||||
#define CGEMM_DEFAULT_P 108
|
||||
#define ZGEMM_DEFAULT_P 112
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_Q 116
|
||||
#define DGEMM_DEFAULT_Q 116
|
||||
#define CGEMM_DEFAULT_Q 144
|
||||
#define ZGEMM_DEFAULT_Q 72
|
||||
#define SGEMM_DEFAULT_P 64
|
||||
#define DGEMM_DEFAULT_P 44
|
||||
#define CGEMM_DEFAULT_P 64
|
||||
#define ZGEMM_DEFAULT_P 32
|
||||
|
||||
#define SGEMM_DEFAULT_R 1000
|
||||
#define DGEMM_DEFAULT_R 1000
|
||||
#define CGEMM_DEFAULT_R 2000
|
||||
#define ZGEMM_DEFAULT_R 2000
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 92
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 80
|
||||
|
||||
#define SGEMM_DEFAULT_R 640
|
||||
#define DGEMM_DEFAULT_R dgemm_r
|
||||
#define CGEMM_DEFAULT_R 640
|
||||
#define ZGEMM_DEFAULT_R 640
|
||||
|
||||
#define GEMM_OFFSET_A1 0x10000
|
||||
#define GEMM_OFFSET_B1 0x100000
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
/*
 * Compile-time tuning constants used when LOONGSON3B is defined.
 * The S/D/C/Z prefixes give one value per GEMM flavour.
 * NOTE(review): presumably single/double real and single/double complex - confirm.
 */
#ifdef LOONGSON3B
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
/* Default buffer offsets and alignment mask for GEMM. */
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
/* Unroll factors in M and N: 2 x 2 for every flavour. */
/* NOTE(review): these presumably must match the selected kernels - confirm. */
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
/* Default P values. */
/* NOTE(review): P/Q/R are presumably cache-blocking sizes - confirm against the GEMM driver. */
#define SGEMM_DEFAULT_P 64
|
||||
#define DGEMM_DEFAULT_P 24
|
||||
#define CGEMM_DEFAULT_P 24
|
||||
#define ZGEMM_DEFAULT_P 20
|
||||
|
||||
/* Default Q values. */
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 64
|
||||
|
||||
/* Default R values: 512 for every flavour. */
#define SGEMM_DEFAULT_R 512
|
||||
#define DGEMM_DEFAULT_R 512
|
||||
#define CGEMM_DEFAULT_R 512
|
||||
#define ZGEMM_DEFAULT_R 512
|
||||
|
||||
/* Secondary offsets for the A and B buffers. */
#define GEMM_OFFSET_A1 0x10000
|
||||
#define GEMM_OFFSET_B1 0x100000
|
||||
|
||||
/* NOTE(review): presumably the block size used by the SYMV routines - confirm. */
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
@@ -1301,6 +1301,8 @@
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -1303,6 +1303,8 @@
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
Reference in New Issue
Block a user