Merge branch 'release-0.1.0'
This commit is contained in:
commit
09f74f6d23
|
@ -1,4 +1,22 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.1.0
|
||||
23-Mar-2012
|
||||
common:
|
||||
* Set soname of shared library on Linux.
|
||||
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
|
||||
this flag to control the library name, e.g. libopenblas.a,
|
||||
libopenblas_ifort.a or libopenblas_omp.a.
|
||||
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
|
||||
The library uses a single thread in GEMM functions for small matrices.
|
||||
x86/x86_64:
|
||||
* Used GEMV SSE/SSE2 kernels on x86 32-bit.
|
||||
* Exported CBLAS functions in Windows DLL.
|
||||
MIPS64:
|
||||
* Completed Level-3 BLAS optimization on Loongson 3A CPU.
|
||||
* Improved GEMV performance on Loongson 3A CPU.
|
||||
* Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL)
|
||||
|
||||
====================================================================
|
||||
Version 0.1 alpha2.5
|
||||
19-Feb-2012
|
||||
|
|
26
Makefile
26
Makefile
|
@ -82,27 +82,28 @@ endif
|
|||
shared :
|
||||
ifeq ($(OSNAME), Linux)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
$(MAKE) -C exports so
|
||||
-ln -fs $(LIBSONAME) libopenblas.so
|
||||
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
$(MAKE) -C exports dyn
|
||||
-ln -fs $(LIBDYNNAME) libopenblas.dylib
|
||||
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
$(MAKE) -C exports dll
|
||||
-ln -fs $(LIBDLLNAME) libopenblas.dll
|
||||
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
$(MAKE) -C exports dll
|
||||
-ln -fs $(LIBDLLNAME) libopenblas.dll
|
||||
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
|
||||
endif
|
||||
|
||||
tests :
|
||||
|
@ -130,7 +131,7 @@ endif
|
|||
ifeq ($(NOFORTRAN), 1)
|
||||
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
|
||||
endif
|
||||
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -158,7 +159,7 @@ endif
|
|||
prof : prof_blas prof_lapack
|
||||
|
||||
prof_blas :
|
||||
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d prof || exit 1 ; \
|
||||
|
@ -169,7 +170,7 @@ ifdef DYNAMIC_ARCH
|
|||
endif
|
||||
|
||||
blas :
|
||||
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d libs || exit 1 ; \
|
||||
|
@ -177,7 +178,7 @@ blas :
|
|||
done
|
||||
|
||||
hpl :
|
||||
ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -191,7 +192,7 @@ ifdef DYNAMIC_ARCH
|
|||
endif
|
||||
|
||||
hpl_p :
|
||||
ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -285,7 +286,8 @@ clean ::
|
|||
#ifdef DYNAMIC_ARCH
|
||||
@$(MAKE) -C kernel clean
|
||||
#endif
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
|
||||
@$(MAKE) -C reference clean
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||
@if test -d lapack-3.4.0; then \
|
||||
echo deleting lapack-3.4.0; \
|
||||
|
|
|
@ -38,33 +38,34 @@ install : lib.grd
|
|||
#for install static library
|
||||
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
|
||||
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.$(LIBSUFFIX)
|
||||
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
|
||||
#for install shared library
|
||||
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), Linux)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dylib
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll
|
||||
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
|
||||
endif
|
||||
|
||||
@echo Install OK!
|
||||
|
|
|
@ -3,7 +3,12 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.1alpha2.5
|
||||
VERSION = 0.1.0
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname of the shared library
|
||||
# is libopenblas_$(LIBNAMESUFFIX).so.0.
|
||||
# LIBNAMESUFFIX = omp
|
||||
|
||||
# You can specify the target architecture, otherwise it's
|
||||
# automatically detected.
|
||||
|
@ -83,6 +88,11 @@ VERSION = 0.1alpha2.5
|
|||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||
# with a single thread. You can use this flag to avoid the overhead of multi-threading
|
||||
# in small matrix sizes. The default value is 4.
|
||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||
|
||||
# If you need a sanity check by comparing against reference BLAS. It'll be very
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
|
|
@ -40,6 +40,11 @@ ifdef INTERFACE64
|
|||
GETARCH_FLAGS += -DUSE64BITINT
|
||||
endif
|
||||
|
||||
ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||
GEMM_MULTITHREAD_THRESHOLD=4
|
||||
endif
|
||||
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
||||
|
||||
# This operation is expensive, so it should be executed only once.
|
||||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
@ -279,6 +284,11 @@ CCOMMON_OPT += -march=mips64
|
|||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
@ -342,6 +352,7 @@ endif
|
|||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
EXTRALIB += -lgfortran
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifdef BINARY64
|
||||
|
@ -528,8 +539,10 @@ ifdef SMP
|
|||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
@ -568,7 +581,11 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
|||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifndef LIBNAMESUFFIX
|
||||
LIBPREFIX = libopenblas
|
||||
else
|
||||
LIBPREFIX = libopenblas_$(LIBNAMESUFFIX)
|
||||
endif
|
||||
|
||||
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||
|
||||
|
@ -590,9 +607,11 @@ endif
|
|||
|
||||
ifneq ($(ARCH), x86_64)
|
||||
ifneq ($(ARCH), x86)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
NO_AFFINITY = 1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
CCOMMON_OPT += -DNO_AFFINITY
|
||||
|
@ -636,6 +655,7 @@ MD5SUM = md5sum
|
|||
AWK = awk
|
||||
|
||||
REVISION = -r$(VERSION)
|
||||
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
|
||||
|
||||
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||
|
|
1
README
1
README
|
@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
|
|||
9.Known Issues
|
||||
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
||||
is 64. On 32 bits, it is 32.
|
||||
* On Loongson 3A, "make test" may fail because of a pthread_create error. The error code is EAGAIN. However, the same test case runs fine when you execute it directly from the shell. I don't think this is a bug in OpenBLAS.
|
||||
|
||||
10. Specification of Git Branches
|
||||
We use the git branching model described in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
||||
|
|
|
@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
|
|||
static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
unsigned long *nodemask, unsigned long maxnode,
|
||||
unsigned flags) {
|
||||
#if defined (LOONGSON3B)
|
||||
#if defined (__64BIT__)
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#else
|
||||
return 0; //NULL Implementation on Loongson 3B 32bit.
|
||||
#endif
|
||||
#else
|
||||
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
|
||||
unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags);
|
||||
// unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
|
|
|
@ -2127,7 +2127,9 @@
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sgemm_p;
|
||||
extern BLASLONG sgemm_q;
|
||||
extern BLASLONG sgemm_r;
|
||||
|
|
|
@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
|
|||
|
||||
static inline unsigned int rpcc(void){
|
||||
unsigned long ret;
|
||||
#if defined(LOONGSON3A)
|
||||
unsigned long long tmp;
|
||||
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
|
||||
ret=tmp;
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
// unsigned long long tmp;
|
||||
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
|
||||
//ret=tmp;
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
"rdhwr %0, $2\n"
|
||||
".set pop": "=r"(ret):: "memory");
|
||||
|
||||
#else
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
|
@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
|
|||
return ret;
|
||||
}
|
||||
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#ifndef NO_AFFINITY
|
||||
#define WHEREAMI
|
||||
static inline int WhereAmI(void){
|
||||
int ret=0;
|
||||
__asm__ __volatile__(".set push \n"
|
||||
".set mips32r2\n"
|
||||
"rdhwr %0, $0\n"
|
||||
".set pop": "=r"(ret):: "memory");
|
||||
return ret;
|
||||
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define CMPEQ c.eq.d
|
||||
#define CMPLE c.le.d
|
||||
#define CMPLT c.lt.d
|
||||
#define NEG neg.d
|
||||
#else
|
||||
#define LD lwc1
|
||||
#define ST swc1
|
||||
|
@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define CMPEQ c.eq.s
|
||||
#define CMPLE c.le.s
|
||||
#define CMPLT c.lt.s
|
||||
#define PLU plu.ps
|
||||
#define PLL pll.ps
|
||||
#define PUU puu.ps
|
||||
#define PUL pul.ps
|
||||
#define MADPS madd.ps
|
||||
#define CVTU cvt.s.pu
|
||||
#define CVTL cvt.s.pl
|
||||
#define NEG neg.s
|
||||
#endif
|
||||
|
||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
|
@ -218,13 +247,18 @@ REALNAME: ;\
|
|||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 8 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#define PAGESIZE (32UL << 10)
|
||||
#define FIXED_PAGESIZE (32UL << 10)
|
||||
#endif
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE (64UL << 10)
|
||||
#endif
|
||||
|
@ -236,7 +270,7 @@ REALNAME: ;\
|
|||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#define PREFETCHD_(x) ld $0, x
|
||||
#define PREFETCHD(x) PREFETCHD_(x)
|
||||
#else
|
||||
|
|
19
cpuid_mips.c
19
cpuid_mips.c
|
@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A"
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -101,6 +103,8 @@ int detect(void){
|
|||
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
|
@ -130,6 +134,8 @@ void get_architecture(void){
|
|||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
}else{
|
||||
printf("SICORTEX");
|
||||
}
|
||||
|
@ -149,6 +155,15 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define SICORTEX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
|
@ -164,6 +179,8 @@ void get_cpuconfig(void){
|
|||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
}else{
|
||||
#ifdef __mips64
|
||||
printf("mips64\n");
|
||||
|
|
|
@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
range_M[0] = 0;
|
||||
i = arg -> m;
|
||||
} else {
|
||||
range_M[0] = range_M[0];
|
||||
i = range_M[1] - range_M[0];
|
||||
range_M[0] = range_m[0];
|
||||
i = range_m[1] - range_m[0];
|
||||
}
|
||||
|
||||
num_cpu_m = 0;
|
||||
|
|
|
@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
queue[num_cpu].args = arg;
|
||||
queue[num_cpu].range_m = range_m;
|
||||
queue[num_cpu].range_n = &range[num_cpu];
|
||||
#if defined(LOONGSON3A)
|
||||
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
||||
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[num_cpu].sa = NULL;
|
||||
queue[num_cpu].sb = NULL;
|
||||
#endif
|
||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||
num_cpu ++;
|
||||
}
|
||||
|
||||
if (num_cpu) {
|
||||
#if defined(LOONGSON3A)
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
|
||||
#endif
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas(num_cpu,
|
||||
|
|
|
@ -55,8 +55,8 @@ int CNAME(int mode,
|
|||
range_M[0] = 0;
|
||||
i = arg -> m;
|
||||
} else {
|
||||
range_M[0] = range_M[0];
|
||||
i = range_M[1] - range_M[0];
|
||||
range_M[0] = range_m[0];
|
||||
i = range_m[1] - range_m[0];
|
||||
}
|
||||
|
||||
num_cpu_m = 0;
|
||||
|
|
|
@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
|
|||
/* Initializing routine */
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
|
@ -545,12 +546,16 @@ int blas_thread_init(void){
|
|||
pthread_cond_init (&thread_status[i].wakeup, NULL);
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_create(&blas_threads[i], &attr,
|
||||
ret=pthread_create(&blas_threads[i], &attr,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#else
|
||||
pthread_create(&blas_threads[i], NULL,
|
||||
ret=pthread_create(&blas_threads[i], NULL,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#endif
|
||||
if(ret!=0){
|
||||
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
|
@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
|
|
@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
|
|
|
@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){
|
|||
|
||||
#ifdef OS_LINUX
|
||||
#ifdef DEBUG
|
||||
int ret;
|
||||
int ret=0;
|
||||
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
|
||||
if(ret==-1){
|
||||
int errsv=errno;
|
||||
perror("alloc_mmap:");
|
||||
perror("OpenBLAS alloc_mmap:");
|
||||
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||
}
|
||||
|
||||
|
@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
|
|||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
|
|
@ -45,8 +45,22 @@ int get_L2_size(void);
|
|||
#define DEFAULT_GEMM_P 128
|
||||
#define DEFAULT_GEMM_Q 128
|
||||
#define DEFAULT_GEMM_R 128
|
||||
#define DEFAULT_GEMM_OFFSET_A 0
|
||||
#define DEFAULT_GEMM_OFFSET_B 0
|
||||
|
||||
/* Global Parameter */
|
||||
#if GEMM_OFFSET_A == gemm_offset_a
|
||||
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
|
||||
#else
|
||||
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
|
||||
#endif
|
||||
|
||||
#if GEMM_OFFSET_B == gemm_offset_b
|
||||
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
||||
#else
|
||||
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||
#endif
|
||||
|
||||
#if SGEMM_P == sgemm_p
|
||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||
#else
|
||||
|
@ -666,3 +680,36 @@ void blas_set_parameter(void){
|
|||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
void blas_set_parameter(void){
|
||||
#if defined(LOONGSON3A)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1){
|
||||
#endif
|
||||
//single thread
|
||||
dgemm_r = 1024;
|
||||
#ifdef SMP
|
||||
}else{
|
||||
//multi thread
|
||||
dgemm_r = 200;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1 || blas_num_threads == 2){
|
||||
#endif
|
||||
//single thread
|
||||
dgemm_r = 640;
|
||||
#ifdef SMP
|
||||
}else{
|
||||
//multi thread
|
||||
dgemm_r = 160;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -58,16 +58,16 @@ dll : ../$(LIBDLLNAME)
|
|||
|
||||
dll2 : libgoto2_shared.dll
|
||||
|
||||
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
|
||||
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
|
||||
$(RANLIB) ../$(LIBNAME)
|
||||
ifeq ($(BINARY32), 1)
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
||||
-lib /machine:i386 /def:libgoto2.def
|
||||
-lib /machine:i386 /def:libopenblas.def
|
||||
else
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
|
||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
||||
-lib /machine:X64 /def:libgoto2.def
|
||||
-lib /machine:X64 /def:libopenblas.def
|
||||
endif
|
||||
|
||||
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
||||
|
@ -75,7 +75,7 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
|
|||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
|
||||
|
||||
libgoto2.def : gensymbol
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
|
||||
|
||||
libgoto2_shared.def : gensymbol
|
||||
|
@ -100,7 +100,7 @@ so : ../$(LIBSONAME)
|
|||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||
-Wl,--retain-symbols-file=linux.def $(EXTRALIB)
|
||||
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
|
||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
|
|
|
@ -301,7 +301,7 @@
|
|||
if ($ARGV[5] == 1) {
|
||||
#NO_LAPACK=1
|
||||
@objs = (@blasobjs);
|
||||
} elsif (-d "../lapack-3.1.1") {
|
||||
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") {
|
||||
@objs = (@blasobjs, @lapackobjs, @lapackobjs2);
|
||||
} else {
|
||||
@objs = (@blasobjs, @lapackobjs);
|
||||
|
@ -389,6 +389,13 @@ if ($ARGV[0] eq "win2k"){
|
|||
$count ++;
|
||||
}
|
||||
|
||||
if ($ARGV[4] == 0) {
|
||||
foreach $objs (@cblasobjs) {
|
||||
print "\t",$objs,"=$objs"," \@", $count, "\n";
|
||||
$count ++;
|
||||
}
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
|
4
f_check
4
f_check
|
@ -284,6 +284,10 @@ if ($link ne "") {
|
|||
|
||||
}
|
||||
|
||||
if ($vendor eq "INTEL"){
|
||||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
|
||||
|
|
15
getarch.c
15
getarch.c
|
@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_CELL */
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3A */
|
||||
/* #define FORCE_LOONGSON3B */
|
||||
/* #define FORCE_ITANIUM2 */
|
||||
/* #define FORCE_GENERIC */
|
||||
/* #define FORCE_SPARC */
|
||||
|
@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON3B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "LOONGSON3B"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DLOONGSON3B " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "loongson3b"
|
||||
#define CORENAME "LOONGSON3B"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ITANIUM2
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "IA64"
|
||||
|
|
|
@ -34,6 +34,7 @@ int main(int argc, char **argv) {
|
|||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
#endif
|
||||
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -770,20 +770,36 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c
|
|||
xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
|
||||
$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
|
||||
|
||||
ifndef USE_NETLIB_GEMV
|
||||
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
else
|
||||
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
|
||||
dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
endif
|
||||
|
||||
qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
ifndef USE_NETLIB_GEMV
|
||||
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
||||
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
else
|
||||
cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
|
||||
zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f
|
||||
$(FC) -c $(FFLAGS) -o $(@F) $<
|
||||
endif
|
||||
|
||||
xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c
|
||||
$(CC) -c $(CFLAGS) -o $(@F) $<
|
||||
|
|
|
@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|
||||
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
|
||||
args.nthreads = 1;
|
||||
}else{
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
}
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,285 @@
|
|||
*  CGEMV - single precision complex matrix-vector multiply-add.
*
*  Computes one of
*
*     y := alpha*A*x    + beta*y     when TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y     when TRANS = 'T' or 't'
*     y := alpha*A**H*x + beta*y     when TRANS = 'C' or 'c'
*
*  with scalars alpha and beta, vectors x and y, and an m by n
*  matrix A.
*
*  Arguments
*  ---------
*  TRANS  (in)  CHARACTER*1.  Selects the operation as listed above.
*  M      (in)  INTEGER.  Number of rows of A; at least zero.
*  N      (in)  INTEGER.  Number of columns of A; at least zero.
*  ALPHA  (in)  COMPLEX.  The scalar alpha.
*  A      (in)  COMPLEX array, dimension (LDA,n).  The leading m by n
*               part holds the matrix of coefficients.
*  LDA    (in)  INTEGER.  First dimension of A as declared by the
*               caller; at least max(1,m).
*  X      (in)  COMPLEX array of at least 1+(n-1)*abs(INCX) elements
*               when TRANS = 'N' or 'n', and at least
*               1+(m-1)*abs(INCX) otherwise.  The incremented vector x.
*  INCX   (in)  INTEGER.  Increment between elements of X; non-zero.
*  BETA   (in)  COMPLEX.  The scalar beta.  When BETA is zero, Y need
*               not be set on input.
*  Y      (in/out)  COMPLEX array of at least 1+(m-1)*abs(INCY)
*               elements when TRANS = 'N' or 'n', and at least
*               1+(n-1)*abs(INCY) otherwise.  Overwritten by the
*               updated vector y.
*  INCY   (in)  INTEGER.  Increment between elements of Y; non-zero.
*
*  An invalid argument is reported through XERBLA with its position
*  in the argument list (1, 2, 3, 6, 8 or 11), after which the
*  routine returns.  The vector and matrix arguments are not
*  referenced when M = 0 or N = 0.
*
*  Level 2 BLAS routine, written on 22-October-1986 by
*  Jack Dongarra (Argonne National Lab.), Jeremy Du Croz and
*  Sven Hammarling (Nag Central Office) and Richard Hanson
*  (Sandia National Labs.).
*
      SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      COMPLEX ALPHA,BETA
*     .. Array Arguments ..
      COMPLEX A(LDA,*),X(*),Y(*)
*     .. Constants ..
      COMPLEX ONE,ZERO
      PARAMETER (ONE= (1.0E+0,0.0E+0),ZERO= (0.0E+0,0.0E+0))
*     .. Locals ..
      COMPLEX ACC
      INTEGER ROW,COL,IERR,NX,NY,XBEG,YBEG,XPOS,YPOS
      LOGICAL NOTRAN,NOCONJ
*     .. Externals ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC CONJG,MAX
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Argument checks: IERR records the position of the first
*     offending argument.
*
      IERR = 0
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('CGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for alpha = 0, beta = 1.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Plain transpose uses A as stored; 'C' conjugates each element.
*
      NOCONJ = LSAME(TRANS,'T')
*
*     NX and NY are the lengths of x and y; XBEG and YBEG index their
*     first elements (the far end of the array when the increment is
*     negative).
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      XBEG = 1
      IF (INCX.LE.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LE.0) YBEG = 1 - (NY-1)*INCY
*
*     Step 1: y := beta*y  (skipped when beta is one).
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 ROW = 1,NY
                  Y(ROW) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 ROW = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 ROW = 1,NY
                  Y(ROW) = BETA*Y(ROW)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 ROW = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        Step 2a: y := alpha*A*x + y, one column of A at a time.
*        Columns whose x entry is exactly zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + ACC*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + ACC*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        Step 2b: y := alpha*A**T*x + y  or  y := alpha*A**H*x + y,
*        one dot product per column of A.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 110 COL = 1,N
                  ACC = ZERO
                  IF (NOCONJ) THEN
                      DO 90 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(ROW)
   90                 CONTINUE
                  ELSE
                      DO 100 ROW = 1,M
                          ACC = ACC + CONJG(A(ROW,COL))*X(ROW)
  100                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  110         CONTINUE
          ELSE
              DO 140 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  IF (NOCONJ) THEN
                      DO 120 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(XPOS)
                          XPOS = XPOS + INCX
  120                 CONTINUE
                  ELSE
                      DO 130 ROW = 1,M
                          ACC = ACC + CONJG(A(ROW,COL))*X(XPOS)
                          XPOS = XPOS + INCX
  130                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  140         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of CGEMV .
*
      END
|
|
@ -0,0 +1,265 @@
|
|||
*  DGEMV - double precision general matrix-vector multiply-add.
*
*  Computes one of
*
*     y := alpha*A*x    + beta*y     when TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y     when TRANS = 'T', 't', 'C' or 'c'
*
*  with scalars alpha and beta, vectors x and y, and an m by n
*  matrix A.
*
*  Arguments
*  ---------
*  TRANS  (in)  CHARACTER*1.  Selects the operation as listed above.
*  M      (in)  INTEGER.  Number of rows of A; at least zero.
*  N      (in)  INTEGER.  Number of columns of A; at least zero.
*  ALPHA  (in)  DOUBLE PRECISION.  The scalar alpha.
*  A      (in)  DOUBLE PRECISION array, dimension (LDA,n).  The
*               leading m by n part holds the matrix of coefficients.
*  LDA    (in)  INTEGER.  First dimension of A as declared by the
*               caller; at least max(1,m).
*  X      (in)  DOUBLE PRECISION array of at least 1+(n-1)*abs(INCX)
*               elements when TRANS = 'N' or 'n', and at least
*               1+(m-1)*abs(INCX) otherwise.  The incremented vector x.
*  INCX   (in)  INTEGER.  Increment between elements of X; non-zero.
*  BETA   (in)  DOUBLE PRECISION.  The scalar beta.  When BETA is
*               zero, Y need not be set on input.
*  Y      (in/out)  DOUBLE PRECISION array of at least
*               1+(m-1)*abs(INCY) elements when TRANS = 'N' or 'n',
*               and at least 1+(n-1)*abs(INCY) otherwise.
*               Overwritten by the updated vector y.
*  INCY   (in)  INTEGER.  Increment between elements of Y; non-zero.
*
*  An invalid argument is reported through XERBLA with its position
*  in the argument list (1, 2, 3, 6, 8 or 11), after which the
*  routine returns.  The vector and matrix arguments are not
*  referenced when M = 0 or N = 0.
*
*  Level 2 BLAS routine, written on 22-October-1986 by
*  Jack Dongarra (Argonne National Lab.), Jeremy Du Croz and
*  Sven Hammarling (Nag Central Office) and Richard Hanson
*  (Sandia National Labs.).
*
      SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      DOUBLE PRECISION ALPHA,BETA
*     .. Array Arguments ..
      DOUBLE PRECISION A(LDA,*),X(*),Y(*)
*     .. Constants ..
      DOUBLE PRECISION ONE,ZERO
      PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
*     .. Locals ..
      DOUBLE PRECISION ACC
      INTEGER ROW,COL,IERR,NX,NY,XBEG,YBEG,XPOS,YPOS
      LOGICAL NOTRAN
*     .. Externals ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC MAX
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Argument checks: IERR records the position of the first
*     offending argument.
*
      IERR = 0
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('DGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for alpha = 0, beta = 1.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the lengths of x and y; XBEG and YBEG index their
*     first elements (the far end of the array when the increment is
*     negative).
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      XBEG = 1
      IF (INCX.LE.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LE.0) YBEG = 1 - (NY-1)*INCY
*
*     Step 1: y := beta*y  (skipped when beta is one).
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 ROW = 1,NY
                  Y(ROW) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 ROW = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 ROW = 1,NY
                  Y(ROW) = BETA*Y(ROW)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 ROW = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        Step 2a: y := alpha*A*x + y, one column of A at a time.
*        Columns whose x entry is exactly zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + ACC*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + ACC*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        Step 2b: y := alpha*A**T*x + y, one dot product per column.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 100 COL = 1,N
                  ACC = ZERO
                  DO 90 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(ROW)
   90             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  100         CONTINUE
          ELSE
              DO 120 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  DO 110 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(XPOS)
                      XPOS = XPOS + INCX
  110             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  120         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of DGEMV .
*
      END
|
|
@ -0,0 +1,265 @@
|
|||
*  SGEMV - single precision general matrix-vector multiply-add.
*
*  Computes one of
*
*     y := alpha*A*x    + beta*y     when TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y     when TRANS = 'T', 't', 'C' or 'c'
*
*  with scalars alpha and beta, vectors x and y, and an m by n
*  matrix A.
*
*  Arguments
*  ---------
*  TRANS  (in)  CHARACTER*1.  Selects the operation as listed above.
*  M      (in)  INTEGER.  Number of rows of A; at least zero.
*  N      (in)  INTEGER.  Number of columns of A; at least zero.
*  ALPHA  (in)  REAL.  The scalar alpha.
*  A      (in)  REAL array, dimension (LDA,n).  The leading m by n
*               part holds the matrix of coefficients.
*  LDA    (in)  INTEGER.  First dimension of A as declared by the
*               caller; at least max(1,m).
*  X      (in)  REAL array of at least 1+(n-1)*abs(INCX) elements
*               when TRANS = 'N' or 'n', and at least
*               1+(m-1)*abs(INCX) otherwise.  The incremented vector x.
*  INCX   (in)  INTEGER.  Increment between elements of X; non-zero.
*  BETA   (in)  REAL.  The scalar beta.  When BETA is zero, Y need
*               not be set on input.
*  Y      (in/out)  REAL array of at least 1+(m-1)*abs(INCY) elements
*               when TRANS = 'N' or 'n', and at least
*               1+(n-1)*abs(INCY) otherwise.  Overwritten by the
*               updated vector y.
*  INCY   (in)  INTEGER.  Increment between elements of Y; non-zero.
*
*  An invalid argument is reported through XERBLA with its position
*  in the argument list (1, 2, 3, 6, 8 or 11), after which the
*  routine returns.  The vector and matrix arguments are not
*  referenced when M = 0 or N = 0.
*
*  Level 2 BLAS routine, written on 22-October-1986 by
*  Jack Dongarra (Argonne National Lab.), Jeremy Du Croz and
*  Sven Hammarling (Nag Central Office) and Richard Hanson
*  (Sandia National Labs.).
*
      SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      REAL ALPHA,BETA
*     .. Array Arguments ..
      REAL A(LDA,*),X(*),Y(*)
*     .. Constants ..
      REAL ONE,ZERO
      PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
*     .. Locals ..
      REAL ACC
      INTEGER ROW,COL,IERR,NX,NY,XBEG,YBEG,XPOS,YPOS
      LOGICAL NOTRAN
*     .. Externals ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC MAX
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Argument checks: IERR records the position of the first
*     offending argument.
*
      IERR = 0
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('SGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for alpha = 0, beta = 1.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     NX and NY are the lengths of x and y; XBEG and YBEG index their
*     first elements (the far end of the array when the increment is
*     negative).
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      XBEG = 1
      IF (INCX.LE.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LE.0) YBEG = 1 - (NY-1)*INCY
*
*     Step 1: y := beta*y  (skipped when beta is one).
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 ROW = 1,NY
                  Y(ROW) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 ROW = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 ROW = 1,NY
                  Y(ROW) = BETA*Y(ROW)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 ROW = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        Step 2a: y := alpha*A*x + y, one column of A at a time.
*        Columns whose x entry is exactly zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + ACC*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + ACC*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        Step 2b: y := alpha*A**T*x + y, one dot product per column.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 100 COL = 1,N
                  ACC = ZERO
                  DO 90 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(ROW)
   90             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  100         CONTINUE
          ELSE
              DO 120 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  DO 110 ROW = 1,M
                      ACC = ACC + A(ROW,COL)*X(XPOS)
                      XPOS = XPOS + INCX
  110             CONTINUE
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  120         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of SGEMV .
*
      END
|
|
@ -0,0 +1,285 @@
|
|||
*  ZGEMV - double precision complex matrix-vector multiply-add.
*
*  Computes one of
*
*     y := alpha*A*x    + beta*y     when TRANS = 'N' or 'n'
*     y := alpha*A**T*x + beta*y     when TRANS = 'T' or 't'
*     y := alpha*A**H*x + beta*y     when TRANS = 'C' or 'c'
*
*  with scalars alpha and beta, vectors x and y, and an m by n
*  matrix A.
*
*  Arguments
*  ---------
*  TRANS  (in)  CHARACTER*1.  Selects the operation as listed above.
*  M      (in)  INTEGER.  Number of rows of A; at least zero.
*  N      (in)  INTEGER.  Number of columns of A; at least zero.
*  ALPHA  (in)  COMPLEX*16.  The scalar alpha.
*  A      (in)  COMPLEX*16 array, dimension (LDA,n).  The leading
*               m by n part holds the matrix of coefficients.
*  LDA    (in)  INTEGER.  First dimension of A as declared by the
*               caller; at least max(1,m).
*  X      (in)  COMPLEX*16 array of at least 1+(n-1)*abs(INCX)
*               elements when TRANS = 'N' or 'n', and at least
*               1+(m-1)*abs(INCX) otherwise.  The incremented vector x.
*  INCX   (in)  INTEGER.  Increment between elements of X; non-zero.
*  BETA   (in)  COMPLEX*16.  The scalar beta.  When BETA is zero, Y
*               need not be set on input.
*  Y      (in/out)  COMPLEX*16 array of at least 1+(m-1)*abs(INCY)
*               elements when TRANS = 'N' or 'n', and at least
*               1+(n-1)*abs(INCY) otherwise.  Overwritten by the
*               updated vector y.
*  INCY   (in)  INTEGER.  Increment between elements of Y; non-zero.
*
*  An invalid argument is reported through XERBLA with its position
*  in the argument list (1, 2, 3, 6, 8 or 11), after which the
*  routine returns.  The vector and matrix arguments are not
*  referenced when M = 0 or N = 0.
*
*  Level 2 BLAS routine, written on 22-October-1986 by
*  Jack Dongarra (Argonne National Lab.), Jeremy Du Croz and
*  Sven Hammarling (Nag Central Office) and Richard Hanson
*  (Sandia National Labs.).
*
      SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
*     .. Scalar Arguments ..
      CHARACTER TRANS
      INTEGER M,N,LDA,INCX,INCY
      DOUBLE COMPLEX ALPHA,BETA
*     .. Array Arguments ..
      DOUBLE COMPLEX A(LDA,*),X(*),Y(*)
*     .. Constants ..
      DOUBLE COMPLEX ONE,ZERO
      PARAMETER (ONE= (1.0D+0,0.0D+0),ZERO= (0.0D+0,0.0D+0))
*     .. Locals ..
      DOUBLE COMPLEX ACC
      INTEGER ROW,COL,IERR,NX,NY,XBEG,YBEG,XPOS,YPOS
      LOGICAL NOTRAN,NOCONJ
*     .. Externals ..
      LOGICAL LSAME
      EXTERNAL LSAME,XERBLA
      INTRINSIC DCONJG,MAX
*
      NOTRAN = LSAME(TRANS,'N')
*
*     Argument checks: IERR records the position of the first
*     offending argument.
*
      IERR = 0
      IF (.NOT.NOTRAN .AND. .NOT.LSAME(TRANS,'T') .AND.
     +    .NOT.LSAME(TRANS,'C')) THEN
          IERR = 1
      ELSE IF (M.LT.0) THEN
          IERR = 2
      ELSE IF (N.LT.0) THEN
          IERR = 3
      ELSE IF (LDA.LT.MAX(1,M)) THEN
          IERR = 6
      ELSE IF (INCX.EQ.0) THEN
          IERR = 8
      ELSE IF (INCY.EQ.0) THEN
          IERR = 11
      END IF
      IF (IERR.NE.0) THEN
          CALL XERBLA('ZGEMV ',IERR)
          RETURN
      END IF
*
*     Nothing to do for an empty matrix or for alpha = 0, beta = 1.
*
      IF (M.EQ.0 .OR. N.EQ.0) RETURN
      IF (ALPHA.EQ.ZERO .AND. BETA.EQ.ONE) RETURN
*
*     Plain transpose uses A as stored; 'C' conjugates each element.
*
      NOCONJ = LSAME(TRANS,'T')
*
*     NX and NY are the lengths of x and y; XBEG and YBEG index their
*     first elements (the far end of the array when the increment is
*     negative).
*
      IF (NOTRAN) THEN
          NX = N
          NY = M
      ELSE
          NX = M
          NY = N
      END IF
      XBEG = 1
      IF (INCX.LE.0) XBEG = 1 - (NX-1)*INCX
      YBEG = 1
      IF (INCY.LE.0) YBEG = 1 - (NY-1)*INCY
*
*     Step 1: y := beta*y  (skipped when beta is one).
*
      IF (BETA.EQ.ZERO) THEN
          IF (INCY.EQ.1) THEN
              DO 10 ROW = 1,NY
                  Y(ROW) = ZERO
   10         CONTINUE
          ELSE
              YPOS = YBEG
              DO 20 ROW = 1,NY
                  Y(YPOS) = ZERO
                  YPOS = YPOS + INCY
   20         CONTINUE
          END IF
      ELSE IF (BETA.NE.ONE) THEN
          IF (INCY.EQ.1) THEN
              DO 30 ROW = 1,NY
                  Y(ROW) = BETA*Y(ROW)
   30         CONTINUE
          ELSE
              YPOS = YBEG
              DO 40 ROW = 1,NY
                  Y(YPOS) = BETA*Y(YPOS)
                  YPOS = YPOS + INCY
   40         CONTINUE
          END IF
      END IF
      IF (ALPHA.EQ.ZERO) RETURN
*
      IF (NOTRAN) THEN
*
*        Step 2a: y := alpha*A*x + y, one column of A at a time.
*        Columns whose x entry is exactly zero are skipped.
*
          XPOS = XBEG
          IF (INCY.EQ.1) THEN
              DO 60 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      DO 50 ROW = 1,M
                          Y(ROW) = Y(ROW) + ACC*A(ROW,COL)
   50                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   60         CONTINUE
          ELSE
              DO 80 COL = 1,N
                  IF (X(XPOS).NE.ZERO) THEN
                      ACC = ALPHA*X(XPOS)
                      YPOS = YBEG
                      DO 70 ROW = 1,M
                          Y(YPOS) = Y(YPOS) + ACC*A(ROW,COL)
                          YPOS = YPOS + INCY
   70                 CONTINUE
                  END IF
                  XPOS = XPOS + INCX
   80         CONTINUE
          END IF
      ELSE
*
*        Step 2b: y := alpha*A**T*x + y  or  y := alpha*A**H*x + y,
*        one dot product per column of A.
*
          YPOS = YBEG
          IF (INCX.EQ.1) THEN
              DO 110 COL = 1,N
                  ACC = ZERO
                  IF (NOCONJ) THEN
                      DO 90 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(ROW)
   90                 CONTINUE
                  ELSE
                      DO 100 ROW = 1,M
                          ACC = ACC + DCONJG(A(ROW,COL))*X(ROW)
  100                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  110         CONTINUE
          ELSE
              DO 140 COL = 1,N
                  ACC = ZERO
                  XPOS = XBEG
                  IF (NOCONJ) THEN
                      DO 120 ROW = 1,M
                          ACC = ACC + A(ROW,COL)*X(XPOS)
                          XPOS = XPOS + INCX
  120                 CONTINUE
                  ELSE
                      DO 130 ROW = 1,M
                          ACC = ACC + DCONJG(A(ROW,COL))*X(XPOS)
                          XPOS = XPOS + INCX
  130                 CONTINUE
                  END IF
                  Y(YPOS) = Y(YPOS) + ALPHA*ACC
                  YPOS = YPOS + INCY
  140         CONTINUE
          END IF
      END IF
*
      RETURN
*
*     End of ZGEMV .
*
      END
|
|
@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
|
@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
|
@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
|
@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
|
|
|
@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
|||
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
|
||||
ifeq ($(TARGET), LOONGSON3B)
|
||||
# ---------------------------------------------------------------------------
# TRMM kernel objects for TARGET=LOONGSON3B.
#
# Every object is compiled with -DTRMMKERNEL.  The s/d/c/z variants are built
# from the dedicated $(STRMMKERNEL) / $(DTRMMKERNEL) / $(CTRMMKERNEL) /
# $(ZTRMMKERNEL) sources; the q (XDOUBLE real) variants are built from
# $(QGEMMKERNEL).
#
# Target suffix -> flags (real kernels):
#   LN : -DLEFT -UTRANSA        LT : -DLEFT -DTRANSA
#   RN : -ULEFT -UTRANSA        RT : -ULEFT -DTRANSA
# Complex kernels add a conjugation selector:
#   LN/LT/RN/RT : -UCONJ -DNN
#   LR/LC       : -DCONJ -DCN   (with -DLEFT)
#   RR/RC       : -DCONJ -DNC   (with -ULEFT)
# ---------------------------------------------------------------------------

# --- single precision real ---
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@

$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@

$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@

# --- double precision real ---
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@

$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@

$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@

# --- extended precision real (compiled from the GEMM kernel source) ---
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@

$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@

$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@

# --- single precision complex ---
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@

$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@

$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@

$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@

# --- double precision complex ---
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@

$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@

$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@

$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@

$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
||||
|
@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
|||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
#include "common.h"
|
||||
/*
 * Generic 2x2 register-blocked GEMM micro-kernel:  C += alpha * (A-panel * B-panel).
 *
 *   bm, bn, bk  tile extents: rows of C, columns of C, and number of k-steps.
 *   alpha       scalar applied to every accumulated dot product before it is
 *               added into C.
 *   ba          A panel.  As read here: for each pair of rows, bk groups of
 *               2 consecutive values; then (if bm is odd) bk single values for
 *               the last row.
 *   bb          B panel.  As read here: for each pair of columns, bk groups of
 *               2 consecutive values; then (if bn is odd) bk single values for
 *               the last column.
 *   C, ldc      output; the two columns of a column pair are ldc elements
 *               apart and the two rows of a row pair are adjacent.
 *   offset      only present when TRMMKERNEL is defined; this routine does
 *               not read it.
 *
 * Always returns 0.
 *
 * The bk/4 + (bk&3) and bm/2 + (bm&1) trip counts are kept exactly as in the
 * hand-unrolled form so that every accumulator sees the same sequence of
 * additions.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
          ,BLASLONG offset
#endif
          )
{
    BLASLONG row, col, kk, u;
    FLOAT *c_lo, *c_hi;          /* first / second column of the current column pair */
    FLOAT *pa, *pb;              /* read cursors into the A and B panels */
    FLOAT acc00, acc10, acc01, acc11;
    FLOAT a0, a1, b0, b1;

    /* ---- pairs of columns ---- */
    for (col = bn / 2; col > 0; col--) {
        c_lo = C;
        c_hi = C + ldc;
        pa   = ba;

        /* 2x2 tiles */
        for (row = bm / 2; row > 0; row--) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;
            acc01 = 0;
            acc11 = 0;

            /* main part: four k-steps per pass */
            for (kk = bk / 4; kk > 0; kk--) {
                for (u = 0; u < 4; u++) {
                    a0 = pa[2 * u + 0];
                    b0 = pb[2 * u + 0];
                    acc00 = acc00 + a0 * b0;
                    a1 = pa[2 * u + 1];
                    acc10 = acc10 + a1 * b0;
                    b1 = pb[2 * u + 1];
                    acc01 = acc01 + a0 * b1;
                    acc11 = acc11 + a1 * b1;
                }
                pa += 8;
                pb += 8;
            }

            /* remaining 0..3 k-steps */
            for (kk = bk & 3; kk > 0; kk--) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                a1 = pa[1];
                acc10 = acc10 + a1 * b0;
                b1 = pb[1];
                acc01 = acc01 + a0 * b1;
                acc11 = acc11 + a1 * b1;
                pa += 2;
                pb += 2;
            }

            c_lo[0] = c_lo[0] + acc00 * alpha;
            c_lo[1] = c_lo[1] + acc10 * alpha;
            c_hi[0] = c_hi[0] + acc01 * alpha;
            c_hi[1] = c_hi[1] + acc11 * alpha;

            c_lo += 2;
            c_hi += 2;
        }

        /* leftover single row against the column pair */
        for (row = bm & 1; row > 0; row--) {
            pb = bb;
            acc00 = 0;
            acc01 = 0;

            for (kk = bk; kk > 0; kk--) {
                a0 = pa[0];
                acc00 = acc00 + a0 * pb[0];
                acc01 = acc01 + a0 * pb[1];
                pa += 1;
                pb += 2;
            }

            c_lo[0] = c_lo[0] + acc00 * alpha;
            c_hi[0] = c_hi[0] + acc01 * alpha;

            c_lo += 1;
            c_hi += 1;
        }

        /* advance to the next column pair */
        bb += bk << 1;
        C  += ldc << 1;
    }

    /* ---- leftover single column ---- */
    for (col = bn & 1; col > 0; col--) {
        c_lo = C;
        pa   = ba;

        /* row pairs against one column */
        for (row = bm / 2; row > 0; row--) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;

            for (kk = bk; kk > 0; kk--) {
                b0 = pb[0];
                acc00 = acc00 + pa[0] * b0;
                acc10 = acc10 + pa[1] * b0;
                pa += 2;
                pb += 1;
            }

            c_lo[0] = c_lo[0] + acc00 * alpha;
            c_lo[1] = c_lo[1] + acc10 * alpha;

            c_lo += 2;
        }

        /* final 1x1 corner */
        for (row = bm & 1; row > 0; row--) {
            pb = bb;
            acc00 = 0;

            for (kk = bk; kk > 0; kk--) {
                acc00 = acc00 + pa[0] * pb[0];
                pa += 1;
                pb += 1;
            }

            c_lo[0] = c_lo[0] + acc00 * alpha;
            c_lo += 1;
        }

        bb += bk;
        C  += ldc;
    }

    return 0;
}
|
|
@ -0,0 +1,280 @@
|
|||
#include "common.h"
|
||||
/*
 * Generic 2x2 register-blocked TRMM micro-kernel:  C = alpha * (A-panel * B-panel)
 * over a restricted range of k-steps per tile.  Unlike the GEMM kernel, C is
 * overwritten, not accumulated into.
 *
 *   bm, bn, bk  tile extents: rows of C, columns of C, and total k-steps.
 *   alpha       scalar applied to every accumulated dot product.
 *   ba, bb      A and B panels, read as 2 values per k-step for a row/column
 *               pair and 1 value per k-step for a leftover row/column.
 *   C, ldc      output; the columns of a column pair are ldc elements apart.
 *   offset      (TRMMKERNEL only) seeds the running position `off`.
 *
 * How `off` is used (all taken from the code below):
 *   - without LEFT it starts at -offset and grows by the tile width after each
 *     column block; with LEFT it is reset to offset for every column block and
 *     grows by the tile height after each row block.
 *   - when LEFT and TRANSA are both defined or both undefined, a tile uses the
 *     first off + (tile height if LEFT, else tile width) k-steps and the
 *     A cursor is then moved past the unused tail of its panel;
 *   - otherwise a tile skips the first `off` k-steps and uses the remaining
 *     bk - off.
 *   NOTE(review): presumably this confines each tile to the triangular part
 *   of the operand; confirm against the TRMM driver.
 *
 * `off` is only initialised when TRMMKERNEL is defined, so this file is meant
 * to be compiled with -DTRMMKERNEL.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
          ,BLASLONG offset
#endif
          )
{
    BLASLONG row, col, kk, u;
    BLASLONG off, klen;          /* running position / k-steps for the current tile */
    FLOAT *c_lo, *c_hi;          /* first / second column of the current column pair */
    FLOAT *pa, *pb;              /* read cursors into the A and B panels */
    FLOAT acc00, acc10, acc01, acc11;
    FLOAT a0, a1, b0, b1;

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---- pairs of columns ---- */
    for (col = bn / 2; col > 0; col--) {
        c_lo = C;
        c_hi = C + ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        pa = ba;

        /* 2x2 tiles */
        for (row = bm / 2; row > 0; row--) {
            /* defined(X) is 1 or 0, so this is "LEFT and TRANSA agree". */
#if defined(LEFT) == defined(TRANSA)
            pb   = bb;
            klen = off + 2;          /* tile height == tile width == 2 */
#else
            pa  += off * 2;
            pb   = bb + off * 2;
            klen = bk - off;
#endif
            acc00 = 0;
            acc10 = 0;
            acc01 = 0;
            acc11 = 0;

            /* main part: four k-steps per pass */
            for (kk = klen / 4; kk > 0; kk--) {
                for (u = 0; u < 4; u++) {
                    a0 = pa[2 * u + 0];
                    b0 = pb[2 * u + 0];
                    acc00 = acc00 + a0 * b0;
                    a1 = pa[2 * u + 1];
                    acc10 = acc10 + a1 * b0;
                    b1 = pb[2 * u + 1];
                    acc01 = acc01 + a0 * b1;
                    acc11 = acc11 + a1 * b1;
                }
                pa += 8;
                pb += 8;
            }

            /* remaining 0..3 k-steps */
            for (kk = klen & 3; kk > 0; kk--) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                a1 = pa[1];
                acc10 = acc10 + a1 * b0;
                b1 = pb[1];
                acc01 = acc01 + a0 * b1;
                acc11 = acc11 + a1 * b1;
                pa += 2;
                pb += 2;
            }

            c_lo[0] = acc00 * alpha;
            c_lo[1] = acc10 * alpha;
            c_hi[0] = acc01 * alpha;
            c_hi[1] = acc11 * alpha;

#if defined(LEFT) == defined(TRANSA)
            /* step over the k-steps this tile did not consume */
            klen = bk - off - 2;
            pa  += klen * 2;
            pb  += klen * 2;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_lo += 2;
            c_hi += 2;
        }

        /* leftover single row against the column pair */
        for (row = bm & 1; row > 0; row--) {
#if defined(LEFT) == defined(TRANSA)
            pb = bb;
#ifdef LEFT
            klen = off + 1;          /* tile height */
#else
            klen = off + 2;          /* tile width */
#endif
#else
            pa  += off;
            pb   = bb + off * 2;
            klen = bk - off;
#endif
            acc00 = 0;
            acc01 = 0;

            for (kk = klen; kk > 0; kk--) {
                a0 = pa[0];
                acc00 = acc00 + a0 * pb[0];
                acc01 = acc01 + a0 * pb[1];
                pa += 1;
                pb += 2;
            }

            c_lo[0] = acc00 * alpha;
            c_hi[0] = acc01 * alpha;

#if defined(LEFT) == defined(TRANSA)
#ifdef LEFT
            klen = bk - off - 1;
#else
            klen = bk - off - 2;
#endif
            pa += klen;
            pb += klen * 2;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_lo += 1;
            c_hi += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        /* advance to the next column pair */
        bb += bk << 1;
        C  += ldc << 1;
    }

    /* ---- leftover single column ---- */
    for (col = bn & 1; col > 0; col--) {
        c_lo = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        pa = ba;

        /* row pairs against one column */
        for (row = bm / 2; row > 0; row--) {
#if defined(LEFT) == defined(TRANSA)
            pb = bb;
#ifdef LEFT
            klen = off + 2;          /* tile height */
#else
            klen = off + 1;          /* tile width */
#endif
#else
            pa  += off * 2;
            pb   = bb + off;
            klen = bk - off;
#endif
            acc00 = 0;
            acc10 = 0;

            for (kk = klen; kk > 0; kk--) {
                b0 = pb[0];
                acc00 = acc00 + pa[0] * b0;
                acc10 = acc10 + pa[1] * b0;
                pa += 2;
                pb += 1;
            }

            c_lo[0] = acc00 * alpha;
            c_lo[1] = acc10 * alpha;

#if defined(LEFT) == defined(TRANSA)
#ifdef LEFT
            klen = bk - off - 2;
#else
            klen = bk - off - 1;
#endif
            pa += klen * 2;
            pb += klen;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_lo += 2;
        }

        /* final 1x1 corner */
        for (row = bm & 1; row > 0; row--) {
#if defined(LEFT) == defined(TRANSA)
            pb   = bb;
            klen = off + 1;          /* tile height == tile width == 1 */
#else
            pa  += off;
            pb   = bb + off;
            klen = bk - off;
#endif
            acc00 = 0;

            for (kk = klen; kk > 0; kk--) {
                acc00 = acc00 + pa[0] * pb[0];
                pa += 1;
                pb += 1;
            }

            c_lo[0] = acc00 * alpha;

#if defined(LEFT) == defined(TRANSA)
            klen = bk - off - 1;
            pa  += klen;
            pb  += klen;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_lo += 1;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 1;
#endif
        bb += bk;
        C  += ldc;
    }

    return 0;
}
|
|
@ -0,0 +1,838 @@
|
|||
#include "common.h"
|
||||
/********************************
|
||||
ADD1 a*c
|
||||
ADD2 b*c
|
||||
ADD3 a*d
|
||||
ADD4 b*d
|
||||
*********************************/
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
|
||||
for (j=0; j<bn/2; j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+2*ldc;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
res4 = 0;
|
||||
res5 = 0;
|
||||
res6 = 0;
|
||||
res7 = 0;
|
||||
for (k=0; k<bk/4; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
ptrba = ptrba+16;
|
||||
ptrbb = ptrbb+16;
|
||||
}
|
||||
for (k=0; k<(bk&3); k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C0[2] = C0[2]+load2;
|
||||
load3 = res3*alphar;
|
||||
C0[3] = C0[3]+load3;
|
||||
load2 = res3*alphai;
|
||||
C0[2] = C0[2]-load2;
|
||||
load3 = res2*alphai;
|
||||
C0[3] = C0[3]+load3;
|
||||
load4 = res4*alphar;
|
||||
C1[0] = C1[0]+load4;
|
||||
load5 = res5*alphar;
|
||||
C1[1] = C1[1]+load5;
|
||||
load4 = res5*alphai;
|
||||
C1[0] = C1[0]-load4;
|
||||
load5 = res4*alphai;
|
||||
C1[1] = C1[1]+load5;
|
||||
load6 = res6*alphar;
|
||||
C1[2] = C1[2]+load6;
|
||||
load7 = res7*alphar;
|
||||
C1[3] = C1[3]+load7;
|
||||
load6 = res7*alphai;
|
||||
C1[2] = C1[2]-load6;
|
||||
load7 = res6*alphai;
|
||||
C1[3] = C1[3]+load7;
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C1[0] = C1[0]+load2;
|
||||
load3 = res3*alphar;
|
||||
C1[1] = C1[1]+load3;
|
||||
load2 = res3*alphai;
|
||||
C1[0] = C1[0]-load2;
|
||||
load3 = res2*alphai;
|
||||
C1[1] = C1[1]+load3;
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
for (j=0; j<(bn&1); j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
load2 = res2*alphar;
|
||||
C0[2] = C0[2]+load2;
|
||||
load3 = res3*alphar;
|
||||
C0[3] = C0[3]+load3;
|
||||
load2 = res3*alphai;
|
||||
C0[2] = C0[2]-load2;
|
||||
load3 = res2*alphai;
|
||||
C0[3] = C0[3]+load3;
|
||||
C0 = C0+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
ptrbb = bb;
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
for (k=0; k<bk; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar;
|
||||
C0[0] = C0[0]+load0;
|
||||
load1 = res1*alphar;
|
||||
C0[1] = C0[1]+load1;
|
||||
load0 = res1*alphai;
|
||||
C0[0] = C0[0]-load0;
|
||||
load1 = res0*alphai;
|
||||
C0[1] = C0[1]+load1;
|
||||
C0 = C0+2;
|
||||
}
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,923 @@
|
|||
#include "common.h"
|
||||
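/********************************
|
||||
ADD1 a*c   -> even accumulators (res0, res2, ...); a,b are read from ptrba, c,d from ptrbb
|
||||
ADD2 b*c   -> odd accumulators (res1, res3, ...)
|
||||
ADD3 a*d   -> odd accumulators (res1, res3, ...)
|
||||
ADD4 b*d   -> even accumulators; the sign of each term depends on the NN/NR/RN/RR variant
|
||||
*********************************/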
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
|
||||
FLOAT* C,BLASLONG ldc, BLASLONG offset)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
|
||||
BLASLONG off, temp;
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#endif
|
||||
for (j=0; j<bn/2; j+=1)
|
||||
{
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
C0 = C;
|
||||
C1 = C0+2*ldc;
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
res4 = 0;
|
||||
res5 = 0;
|
||||
res6 = 0;
|
||||
res7 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 2;
|
||||
#endif
|
||||
for (k=0; k<temp/4; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1+load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3+load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5+load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7+load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0+load10*load11;
|
||||
res1 = res1+load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2+load13*load11;
|
||||
res3 = res3+load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4+load10*load15;
|
||||
res5 = res5+load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6+load13*load15;
|
||||
res7 = res7+load12*load15;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*1+0];
|
||||
load9 = ptrbb[4*1+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*1+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*1+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*1+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*1+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*1+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*1+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
load0 = ptrba[4*2+0];
|
||||
load1 = ptrbb[4*2+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*2+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*2+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*2+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*2+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*2+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*2+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
load8 = ptrba[4*3+0];
|
||||
load9 = ptrbb[4*3+0];
|
||||
res0 = res0+load8*load9;
|
||||
load10 = ptrba[4*3+1];
|
||||
res1 = res1-load10*load9;
|
||||
load11 = ptrbb[4*3+1];
|
||||
res0 = res0-load10*load11;
|
||||
res1 = res1-load8*load11;
|
||||
load12 = ptrba[4*3+2];
|
||||
res2 = res2+load12*load9;
|
||||
load13 = ptrba[4*3+3];
|
||||
res3 = res3-load13*load9;
|
||||
res2 = res2-load13*load11;
|
||||
res3 = res3-load12*load11;
|
||||
load14 = ptrbb[4*3+2];
|
||||
res4 = res4+load8*load14;
|
||||
res5 = res5-load10*load14;
|
||||
load15 = ptrbb[4*3+3];
|
||||
res4 = res4-load10*load15;
|
||||
res5 = res5-load8*load15;
|
||||
res6 = res6+load12*load14;
|
||||
res7 = res7-load13*load14;
|
||||
res6 = res6-load13*load15;
|
||||
res7 = res7-load12*load15;
|
||||
#endif
|
||||
ptrba = ptrba+16;
|
||||
ptrbb = ptrbb+16;
|
||||
}
|
||||
for (k=0; k<(temp&3); k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5+load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7+load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4+load2*load7;
|
||||
res5 = res5+load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6+load5*load7;
|
||||
res7 = res7+load4*load7;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
load6 = ptrbb[4*0+2];
|
||||
res4 = res4+load0*load6;
|
||||
res5 = res5-load2*load6;
|
||||
load7 = ptrbb[4*0+3];
|
||||
res4 = res4-load2*load7;
|
||||
res5 = res5-load0*load7;
|
||||
res6 = res6+load4*load6;
|
||||
res7 = res7-load5*load6;
|
||||
res6 = res6-load5*load7;
|
||||
res7 = res7-load4*load7;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C0[2] = load2;
|
||||
C0[3] = load3;
|
||||
|
||||
load4 = res4*alphar-res5*alphai;
|
||||
load5 = res5*alphar+res4*alphai;
|
||||
C1[0] = load4;
|
||||
C1[1] = load5;
|
||||
|
||||
load6 = res6*alphar-res7*alphai;
|
||||
load7 = res7*alphar+res6*alphai;
|
||||
C1[2] = load6;
|
||||
C1[3] = load7;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1;
|
||||
#else
|
||||
temp = off+2;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3+load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2+load2*load5;
|
||||
res3 = res3+load0*load5;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[4*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[4*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrbb[4*0+2];
|
||||
res2 = res2+load0*load4;
|
||||
res3 = res3-load2*load4;
|
||||
load5 = ptrbb[4*0+3];
|
||||
res2 = res2-load2*load5;
|
||||
res3 = res3-load0*load5;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+4;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C1[0] = load2;
|
||||
C1[1] = load3;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
for (j=0; j<(bn&1); j+=1)
|
||||
{
|
||||
C0 = C;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
for (i=0; i<bm/2; i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2*2;
|
||||
ptrbb = bb+off*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
res2 = 0;
|
||||
res3 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 2;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3+load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2+load5*load3;
|
||||
res3 = res3+load4*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[4*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[4*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
load4 = ptrba[4*0+2];
|
||||
res2 = res2+load4*load1;
|
||||
load5 = ptrba[4*0+3];
|
||||
res3 = res3-load5*load1;
|
||||
res2 = res2-load5*load3;
|
||||
res3 = res3-load4*load3;
|
||||
#endif
|
||||
ptrba = ptrba+4;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
|
||||
load2 = res2*alphar-res3*alphai;
|
||||
load3 = res3*alphar+res2*alphai;
|
||||
C0[2] = load2;
|
||||
C0[3] = load3;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
C0 = C0+4;
|
||||
}
|
||||
for (i=0; i<(bm&1); i+=1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
res0 = 0;
|
||||
res1 = 0;
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 1;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
for (k=0; k<temp; k+=1)
|
||||
{
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1+load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0+load2*load3;
|
||||
res1 = res1+load0*load3;
|
||||
#endif
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
load0 = ptrba[2*0+0];
|
||||
load1 = ptrbb[2*0+0];
|
||||
res0 = res0+load0*load1;
|
||||
load2 = ptrba[2*0+1];
|
||||
res1 = res1-load2*load1;
|
||||
load3 = ptrbb[2*0+1];
|
||||
res0 = res0-load2*load3;
|
||||
res1 = res1-load0*load3;
|
||||
#endif
|
||||
ptrba = ptrba+2;
|
||||
ptrbb = ptrbb+2;
|
||||
}
|
||||
load0 = res0*alphar-res1*alphai;
|
||||
load1 = res1*alphar+res0*alphai;
|
||||
C0[0] = load0;
|
||||
C0[1] = load1;
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
}
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
|
|||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LN
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LT
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RN
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RT
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LN
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LT
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RN
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RT
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
|
|
|
@ -1,18 +1,48 @@
|
|||
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_loongson3a.S
|
||||
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = gemm_kernel_loongson3a.S
|
||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
|
@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
# Kernel source selection table: every *KERNEL / *COPY variable below names the
# source file used for one BLAS routine, and every *COPYOBJ variable names the
# object file produced for a GEMM copy routine.
# NOTE(review): presumably consumed by the kernel build makefiles -- confirm.

# AXPY: Loongson-3A assembly kernels (S = single, D = double precision).
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
# GEMV: S and D share one C source per variant (n / t); C and Z share the
# corresponding zgemv_* sources.
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
# TRMM: generic 2x2 C kernels (real source for S/D, z* source for C/Z).
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
# GEMM: generic 2x2 C kernels paired with the generic *_ncopy_2 / *_tcopy_2
# copy routines; only the ON/OT copy variants are set here.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
# TRSM: generic C kernels; the same four sources (LN, LT, RN, RT) are used for
# all four precisions.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,101 @@
|
|||
#include "common.h"
|
||||
|
||||
// This code was auto-tuned for the Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
|
||||
/*
 * Real GEMV, non-transposed: Y += ALPHA * A * X.
 *
 * A is addressed as A[LDA * j + i] (row i, column j), X is read with stride
 * INCX and Y is updated with stride INCY.  UNUSED and BUFFER are not
 * referenced.  Always returns 0.
 *
 * The element updates come from the spec_loop / norm_loop macros defined at
 * the top of this file; they use the locals i, j, k, h by name and advance
 * i (and h) themselves, which is why the inner loops have no increment.
 *
 * Both index walks start at 0 and no sign correction is applied to the
 * increments -- presumably the caller has already adjusted X / Y for
 * negative strides (TODO confirm).
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
	const BLASLONG fahead = 30;                  /* prefetch distance, in elements */
	const BLASLONG spec_unroll = 4;              /* macro invocations per unrolled pass */
	const BLASLONG tMQ = M - M % spec_unroll;    /* rows handled by the unrolled loop */
	BLASLONG j;
	BLASLONG k = 0;                              /* index into X for column j */

	/* Nothing to accumulate when the scale factor is zero. */
	if (ALPHA == 0)
		return 0;

	if (ALPHA == 1) {
		if (INCY == 1) {
			/* Unit scale, contiguous Y. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(Y[i + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Leftover rows (M not a multiple of the unroll factor). */
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
				k += INCX;
			}
		} else {
			/* Unit scale, strided Y (h walks Y). */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0, h = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(Y[h + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				while (likely(i < M)) {
					norm_loop_alpha1;
				}
				k += INCX;
			}
		}
	} else {
		if (INCY == 1) {
			/* General scale, contiguous Y. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(Y[i + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				while (likely(i < M)) {
					spec_loop;
				}
				k += INCX;
			}
		} else {
			/* General scale, strided Y. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0, h = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(Y[h + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				while (likely(i < M)) {
					norm_loop;
				}
				k += INCX;
			}
		}
	}

	return 0;
}
|
|
@ -0,0 +1,93 @@
|
|||
#include "common.h"
|
||||
|
||||
// This code was auto-tuned for the Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
|
||||
/*
 * Real GEMV, transposed: for every column j, Y[j * INCY] accumulates
 * ALPHA * sum_i A[LDA * j + i] * X[i * INCX], with i running over M.
 *
 * UNUSED and BUFFER are not referenced.  Always returns 0.
 *
 * The element updates come from the spec_loop / norm_loop macros defined at
 * the top of this file; they use the locals i, j, k, h by name and advance
 * i (and h) themselves, which is why the inner loops have no increment.
 *
 * No sign correction is applied to INCX / INCY here -- presumably the caller
 * handles negative strides (TODO confirm).
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
	const BLASLONG fahead = 30;                  /* prefetch distance, in elements */
	const BLASLONG spec_unroll = 3;              /* macro invocations per unrolled pass */
	const BLASLONG tMQ = M - M % spec_unroll;    /* rows handled by the unrolled loop */
	BLASLONG j;
	BLASLONG k = 0;                              /* index into Y for column j */

	/* Nothing to accumulate when the scale factor is zero. */
	if (ALPHA == 0)
		return 0;

	if (ALPHA == 1) {
		if (INCX == 1) {
			/* Unit scale, contiguous X. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(X[i + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Leftover rows (M not a multiple of the unroll factor). */
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
				k += INCY;
			}
		} else {
			/* Unit scale, strided X (h walks X). */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0, h = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(X[h + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				while (likely(i < M)) {
					norm_loop_alpha1;
				}
				k += INCY;
			}
		}
	} else {
		if (INCX == 1) {
			/* General scale, contiguous X. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(X[i + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				while (likely(i < M)) {
					spec_loop;
				}
				k += INCY;
			}
		} else {
			/* General scale, strided X. */
			for (j = 0; likely(j < N); j++) {
				BLASLONG i = 0, h = 0;
				while (likely(i < tMQ)) {
					prefetch(A[LDA * j + i + fahead]);
					prefetch(X[h + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				while (likely(i < M)) {
					norm_loop;
				}
				k += INCY;
			}
		}
	}

	return 0;
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,139 @@
|
|||
#include "common.h"
|
||||
|
||||
//typedef int BLASLONG;
|
||||
//typedef double FLOAT;
|
||||
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
/*
 * Complex GEMV, non-transposed: Y += alpha * A * x, where A and/or x are
 * conjugated according to CONJ / XCONJ (see the spec_loop_* / norm_loop_*
 * macro selection earlier in this file).
 *
 * All arrays hold interleaved (real, imaginary) FLOAT pairs:
 *   M, N           - rows / columns of A
 *   rALPHA, iALPHA - real and imaginary part of the scale factor alpha
 *   A, LDA         - matrix; column j starts at A[j * LDA * 2]
 *   X, INCX        - input vector; element j is at X[j * INCX * 2]
 *   Y, INCY        - accumulated output; element i is at Y[i * INCY * 2]
 *   UNUSED, BUFFER - not referenced
 * Always returns 0.
 *
 * The macros use the locals ii, iii, jj, k, rTmp, iTmp by name and advance
 * ii / iii themselves (one complex element per invocation).
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Skip all work only when alpha is exactly 0 + 0i.  The previous test
	 * (!rALPHA && iALPHA) returned early for a purely imaginary, non-zero
	 * alpha and therefore left Y un-updated in that case.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 60;                   /* prefetch distance, in FLOATs */
	BLASLONG spec_unroll = 2;               /* complex elements per unrolled pass */
	BLASLONG tMQ = M - M % spec_unroll;     /* rows handled by the unrolled loop */
	BLASLONG j = 0, k = 0, jj = 0;          /* column, X offset, A column offset */

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: accumulate the product directly, no scaling. */
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Leftover row when M is odd. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* Strided Y: iii walks Y in steps of INCY complex elements. */
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: each product is formed in (rTmp, iTmp), then scaled. */
		FLOAT rTmp, iTmp;
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}
|
|
@ -0,0 +1,125 @@
|
|||
#include "common.h"
|
||||
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
|
||||
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
/*
 * Complex GEMV, transposed: for every column j, Y[j] accumulates
 * alpha * sum_i A(i, j) * x(i), where A and/or x are conjugated according to
 * CONJ / XCONJ (see the spec_loop_* / norm_loop_* macro selection earlier in
 * this file).
 *
 * All arrays hold interleaved (real, imaginary) FLOAT pairs:
 *   M, N           - rows / columns of A
 *   rALPHA, iALPHA - real and imaginary part of the scale factor alpha
 *   A, LDA         - matrix; column j starts at A[j * LDA * 2]
 *   X, INCX        - input vector; element i is at X[i * INCX * 2]
 *   Y, INCY        - accumulated output; element j is at Y[j * INCY * 2]
 *   UNUSED, BUFFER - not referenced
 * Always returns 0.
 *
 * The macros use the locals ii, iii, jj, k, rTmp, iTmp by name and advance
 * ii / iii themselves (one complex element per invocation).
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Skip all work only when alpha is exactly 0 + 0i.  The previous test
	 * (!rALPHA && iALPHA) returned early for a purely imaginary, non-zero
	 * alpha and therefore left Y un-updated in that case.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 30;                   /* prefetch distance, in FLOATs */
	BLASLONG spec_unroll = 2;               /* complex elements per unrolled pass */
	BLASLONG tMQ = M - M % spec_unroll;     /* rows handled by the unrolled loop */
	BLASLONG j = 0, k = 0, jj = 0;          /* column, Y offset, A column offset */

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: accumulate the product directly, no scaling. */
		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Leftover row when M is odd. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* Strided X: iii walks X in steps of INCX complex elements. */
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: each product is formed in (rTmp, iTmp), then scaled. */
		FLOAT rTmp, iTmp;
		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}
|
|
@ -239,6 +239,22 @@ ifndef ZSWAPKERNEL
|
|||
ZSWAPKERNEL = zswap_sse2.S
|
||||
endif
|
||||
|
||||
# Fall back to the SSE2 source gemv_n_sse2.S for DGEMVNKERNEL unless the
# variable already has a non-empty value (ifndef treats empty as undefined).
ifndef DGEMVNKERNEL
|
||||
DGEMVNKERNEL = gemv_n_sse2.S
|
||||
endif
|
||||
|
||||
# Fall back to the SSE2 source gemv_t_sse2.S for DGEMVTKERNEL unless the
# variable already has a non-empty value (ifndef treats empty as undefined).
ifndef DGEMVTKERNEL
|
||||
DGEMVTKERNEL = gemv_t_sse2.S
|
||||
endif
|
||||
|
||||
# Fall back to the SSE2 source zgemv_n_sse2.S for ZGEMVNKERNEL unless the
# variable already has a non-empty value (ifndef treats empty as undefined).
ifndef ZGEMVNKERNEL
|
||||
ZGEMVNKERNEL = zgemv_n_sse2.S
|
||||
endif
|
||||
|
||||
# Fall back to the SSE2 source zgemv_t_sse2.S for ZGEMVTKERNEL unless the
# variable already has a non-empty value (ifndef treats empty as undefined).
ifndef ZGEMVTKERNEL
|
||||
ZGEMVTKERNEL = zgemv_t_sse2.S
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
|
|
79
param.h
79
param.h
|
@ -1480,31 +1480,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 32
|
||||
#define DGEMM_DEFAULT_P 32
|
||||
#define CGEMM_DEFAULT_P 108
|
||||
#define ZGEMM_DEFAULT_P 112
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_Q 116
|
||||
#define DGEMM_DEFAULT_Q 116
|
||||
#define CGEMM_DEFAULT_Q 144
|
||||
#define ZGEMM_DEFAULT_Q 72
|
||||
#define SGEMM_DEFAULT_P 64
|
||||
#define DGEMM_DEFAULT_P 44
|
||||
#define CGEMM_DEFAULT_P 64
|
||||
#define ZGEMM_DEFAULT_P 32
|
||||
|
||||
#define SGEMM_DEFAULT_R 1000
|
||||
#define DGEMM_DEFAULT_R 1000
|
||||
#define CGEMM_DEFAULT_R 2000
|
||||
#define ZGEMM_DEFAULT_R 2000
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 92
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 80
|
||||
|
||||
#define SGEMM_DEFAULT_R 640
|
||||
#define DGEMM_DEFAULT_R dgemm_r
|
||||
#define CGEMM_DEFAULT_R 640
|
||||
#define ZGEMM_DEFAULT_R 640
|
||||
|
||||
#define GEMM_OFFSET_A1 0x10000
|
||||
#define GEMM_OFFSET_B1 0x100000
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
/*
 * Parameter set compiled in only when LOONGSON3B is defined.
 * NOTE(review): the *_UNROLL_*, *_P, *_Q and *_R names look like GEMM
 * unroll factors and blocking sizes per precision (S/D/C/Z); their exact
 * meaning is defined by the code that consumes them, which is not visible
 * here -- confirm before retuning.
 */
#ifdef LOONGSON3B
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
/* Mask-style constant: 0x03fff has the low 14 bits set. */
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
/* All four precisions use UNROLL_M = UNROLL_N = 2 in this configuration. */
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
/* Per-precision P values. */
#define SGEMM_DEFAULT_P 64
|
||||
#define DGEMM_DEFAULT_P 24
|
||||
#define CGEMM_DEFAULT_P 24
|
||||
#define ZGEMM_DEFAULT_P 20
|
||||
|
||||
/* Per-precision Q values. */
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 64
|
||||
|
||||
/* R is the same constant (512) for every precision here. */
#define SGEMM_DEFAULT_R 512
|
||||
#define DGEMM_DEFAULT_R 512
|
||||
#define CGEMM_DEFAULT_R 512
|
||||
#define ZGEMM_DEFAULT_R 512
|
||||
|
||||
#define GEMM_OFFSET_A1 0x10000
|
||||
#define GEMM_OFFSET_B1 0x100000
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
|
|
@ -1301,6 +1301,8 @@
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
|
@ -1303,6 +1303,8 @@
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
Loading…
Reference in New Issue