Merge branch 'loongson3b' into release-0.1.0

This commit is contained in:
Xianyi Zhang 2012-03-23 01:26:44 +08:00
commit 3871b6a86d
20 changed files with 10590 additions and 17 deletions

View File

@ -284,6 +284,11 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64
endif endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1 BINARY_DEFINED = 1
endif endif
@ -534,8 +539,10 @@ ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER
ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
ifneq ($(CORE), LOONGSON3B)
USE_SIMPLE_THREADED_LEVEL3 = 1 USE_SIMPLE_THREADED_LEVEL3 = 1
endif endif
endif
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
@ -600,9 +607,11 @@ endif
ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86_64)
ifneq ($(ARCH), x86) ifneq ($(ARCH), x86)
ifneq ($(CORE), LOONGSON3B)
NO_AFFINITY = 1 NO_AFFINITY = 1
endif endif
endif endif
endif
ifdef NO_AFFINITY ifdef NO_AFFINITY
CCOMMON_OPT += -DNO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY

View File

@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
static inline int my_mbind(void *addr, unsigned long len, int mode, static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode, unsigned long *nodemask, unsigned long maxnode,
unsigned flags) { unsigned flags) {
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
return 0; //NULL Implementation on Loongson 3B 32bit.
#endif
#else
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
unsigned long null_nodemask=0; unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags);
#endif
} }
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

View File

@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){ static inline unsigned int rpcc(void){
unsigned long ret; unsigned long ret;
#if defined(LOONGSON3A) #if defined(LOONGSON3A) || defined(LOONGSON3B)
unsigned long long tmp; // unsigned long long tmp;
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
ret=tmp; //ret=tmp;
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else #else
__asm__ __volatile__(".set push \n" __asm__ __volatile__(".set push \n"
".set mips32r2\n" ".set mips32r2\n"
@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
return ret; return ret;
} }
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
/* WHEREAMI advertises that this header supplies a WhereAmI() implementation. */
#define WHEREAMI
/*
 * Returns the value of hardware register $0 as read by the `rdhwr`
 * instruction. Only built for Loongson 3A/3B when affinity support is
 * enabled (NO_AFFINITY not defined).
 *
 * NOTE(review): in the MIPS32 Release 2 definition of RDHWR, register $0 is
 * the CPU number the code is running on, which matches the function name --
 * confirm against the Loongson manual.
 */
static inline int WhereAmI(void){
int ret=0;
/* .set push/.set mips32r2/.set pop: temporarily switch the assembler to
   MIPS32r2 so that `rdhwr` is accepted, then restore the previous mode. */
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $0\n"
".set pop": "=r"(ret):: "memory");
return ret;
}
#endif
#endif
static inline int blas_quickdivide(blasint x, blasint y){ static inline int blas_quickdivide(blasint x, blasint y){
return x / y; return x / y;
} }
@ -234,6 +254,11 @@ REALNAME: ;\
#define FIXED_PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10)
#endif #endif
/* Loongson 3B: both PAGESIZE and FIXED_PAGESIZE are set to
   32UL << 10 = 32768 bytes (32 KiB). */
#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#endif
#ifndef PAGESIZE #ifndef PAGESIZE
#define PAGESIZE (64UL << 10) #define PAGESIZE (64UL << 10)
#endif #endif
@ -245,7 +270,7 @@ REALNAME: ;\
#define MAP_ANONYMOUS MAP_ANON #define MAP_ANONYMOUS MAP_ANON
#endif #endif
#if defined(LOONGSON3A) #if defined(LOONGSON3A) || defined(LOONGSON3B)
#define PREFETCHD_(x) ld $0, x #define PREFETCHD_(x) ld $0, x
#define PREFETCHD(x) PREFETCHD_(x) #define PREFETCHD(x) PREFETCHD_(x)
#else #else

View File

@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_SICORTEX 1 #define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2 #define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKOWN",
"SICORTEX", "SICORTEX",
"LOONGSON3A" "LOONGSON3A",
"LOONGSON3B"
}; };
int detect(void){ int detect(void){
@ -101,6 +103,8 @@ int detect(void){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A; return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){ }else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){ while (fgets(buffer, sizeof(buffer), infile)){
@ -130,6 +134,8 @@ void get_architecture(void){
void get_subarchitecture(void){ void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) { if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A"); printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
}else{ }else{
printf("SICORTEX"); printf("SICORTEX");
} }
@ -149,6 +155,15 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
/* Loongson 3B: emit the cache/TLB description consumed by the build as
   preprocessor defines. L2_SIZE is 512 KiB = 524288 bytes; the previous
   value 512488 was a digit transposition and is not even a multiple of
   the 32-byte L2 line size. */
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{ }else{
printf("#define SICORTEX\n"); printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
@ -164,6 +179,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
if(detect()==CPU_LOONGSON3A) { if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n"); printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
}else{ }else{
#ifdef __mips64 #ifdef __mips64
printf("mips64\n"); printf("mips64\n");

View File

@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
range_M[0] = 0; range_M[0] = 0;
i = arg -> m; i = arg -> m;
} else { } else {
range_M[0] = range_M[0]; range_M[0] = range_m[0];
i = range_M[1] - range_M[0]; i = range_m[1] - range_m[0];
} }
num_cpu_m = 0; num_cpu_m = 0;

View File

@ -55,8 +55,8 @@ int CNAME(int mode,
range_M[0] = 0; range_M[0] = 0;
i = arg -> m; i = arg -> m;
} else { } else {
range_M[0] = range_M[0]; range_M[0] = range_m[0];
i = range_M[1] - range_M[0]; i = range_m[1] - range_m[0];
} }
num_cpu_m = 0; num_cpu_m = 0;

View File

@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#ifdef OS_LINUX #ifdef OS_LINUX
#ifdef DEBUG #if 1
int ret; //#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){ if(ret==-1){
int errsv=errno; int errsv=errno;
perror("alloc_mmap:"); perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
} }

View File

@ -696,5 +696,20 @@ void blas_set_parameter(void){
} }
#endif #endif
#endif #endif
#if defined(LOONGSON3B)
/* Loongson 3B: choose dgemm_r from the configured thread count.
   With SMP, 1 or 2 threads use 640 and any other count uses 160;
   without SMP the value is always 640. */
#ifdef SMP
dgemm_r = (blas_num_threads == 1 || blas_num_threads == 2) ? 640 : 160;
#else
dgemm_r = 640;
#endif
#endif
} }
#endif #endif

View File

@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */ /* #define FORCE_CELL */
/* #define FORCE_SICORTEX */ /* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */ /* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */ /* #define FORCE_GENERIC */
/* #define FORCE_SPARC */ /* #define FORCE_SPARC */
@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#endif #endif
/*
 * Forced target description for Loongson 3B, selected by defining
 * FORCE_LOONGSON3B. ARCHCONFIG carries the cache/TLB parameters as -D flags.
 *
 * L2_SIZE is 512 KiB = 524288 bytes. The earlier value 512488 was a digit
 * transposition (it is not a multiple of the 32-byte L2_LINESIZE).
 */
#ifdef FORCE_LOONGSON3B
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DLOONGSON3B " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3b"
#define CORENAME "LOONGSON3B"
#else
#endif
#ifdef FORCE_ITANIUM2 #ifdef FORCE_ITANIUM2
#define FORCE #define FORCE
#define ARCHITECTURE "IA64" #define ARCHITECTURE "IA64"

View File

@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
ifeq ($(TARGET), LOONGSON3B)
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

View File

@ -0,0 +1,157 @@
#include "common.h"
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
	/*
	 * Plain-C GEMM micro-kernel with 2x2 register blocking.
	 *
	 * For every tile of C it accumulates products of values read from ba
	 * and bb over bk steps, scales the sums by alpha and ADDS them into C.
	 *
	 *   bm, bn  - extents of the C tile walked in steps of 2 (plus a
	 *             single leftover row/column when odd)
	 *   bk      - number of accumulation steps per tile
	 *   ba      - read 2 values per step in the 2-row blocks, 1 value per
	 *             step in the leftover row
	 *   bb      - read 2 values per step in the 2-column panels, 1 value
	 *             per step in the leftover column
	 *   ldc     - distance in elements between consecutive columns of C
	 *   offset  - accepted when TRMMKERNEL is defined but not used here
	 *
	 * Always returns 0.
	 */
	BLASLONG cols_left, rows_left, steps, u;
	FLOAT *c_lo, *c_hi, *a_ptr, *b_ptr;
	FLOAT a0, a1, b0, b1;
	FLOAT acc00, acc10, acc01, acc11;

	/* ---- panels of two C columns ---- */
	for (cols_left = bn / 2; cols_left > 0; cols_left--) {
		c_lo = C;
		c_hi = C + ldc;
		a_ptr = ba;	/* a_ptr keeps advancing across row blocks */

		/* 2x2 tiles */
		for (rows_left = bm / 2; rows_left > 0; rows_left--) {
			b_ptr = bb;
			acc00 = 0;
			acc10 = 0;
			acc01 = 0;
			acc11 = 0;

			/* main part: groups of four k-steps */
			for (steps = bk / 4; steps > 0; steps--) {
				for (u = 0; u < 4; u++) {
					a0 = a_ptr[2 * u + 0];
					b0 = b_ptr[2 * u + 0];
					acc00 += a0 * b0;
					a1 = a_ptr[2 * u + 1];
					acc10 += a1 * b0;
					b1 = b_ptr[2 * u + 1];
					acc01 += a0 * b1;
					acc11 += a1 * b1;
				}
				a_ptr += 8;
				b_ptr += 8;
			}

			/* remaining (bk & 3) k-steps */
			for (steps = bk & 3; steps > 0; steps--) {
				a0 = a_ptr[0];
				b0 = b_ptr[0];
				acc00 += a0 * b0;
				a1 = a_ptr[1];
				acc10 += a1 * b0;
				b1 = b_ptr[1];
				acc01 += a0 * b1;
				acc11 += a1 * b1;
				a_ptr += 2;
				b_ptr += 2;
			}

			/* scale, then accumulate into C */
			acc00 *= alpha;
			c_lo[0] += acc00;
			acc10 *= alpha;
			c_lo[1] += acc10;
			acc01 *= alpha;
			c_hi[0] += acc01;
			acc11 *= alpha;
			c_hi[1] += acc11;

			c_lo += 2;
			c_hi += 2;
		}

		/* leftover single row (bm odd): 1x2 tile */
		if (bm & 1) {
			b_ptr = bb;
			acc00 = 0;
			acc01 = 0;
			for (steps = bk; steps > 0; steps--) {
				a0 = a_ptr[0];
				b0 = b_ptr[0];
				acc00 += a0 * b0;
				b1 = b_ptr[1];
				acc01 += a0 * b1;
				a_ptr += 1;
				b_ptr += 2;
			}
			acc00 *= alpha;
			c_lo[0] += acc00;
			acc01 *= alpha;
			c_hi[0] += acc01;
			c_lo += 1;
			c_hi += 1;
		}

		/* advance to the next 2-column panel of bb and C */
		bb += (bk << 1);
		C += (ldc << 1);
	}

	/* ---- leftover single C column (bn odd) ---- */
	if (bn & 1) {
		c_lo = C;
		a_ptr = ba;

		/* 2x1 tiles */
		for (rows_left = bm / 2; rows_left > 0; rows_left--) {
			b_ptr = bb;
			acc00 = 0;
			acc10 = 0;
			for (steps = bk; steps > 0; steps--) {
				a0 = a_ptr[0];
				b0 = b_ptr[0];
				acc00 += a0 * b0;
				a1 = a_ptr[1];
				acc10 += a1 * b0;
				a_ptr += 2;
				b_ptr += 1;
			}
			acc00 *= alpha;
			c_lo[0] += acc00;
			acc10 *= alpha;
			c_lo[1] += acc10;
			c_lo += 2;
		}

		/* 1x1 tile */
		if (bm & 1) {
			b_ptr = bb;
			acc00 = 0;
			for (steps = bk; steps > 0; steps--) {
				a0 = a_ptr[0];
				b0 = b_ptr[0];
				acc00 += a0 * b0;
				a_ptr += 1;
				b_ptr += 1;
			}
			acc00 *= alpha;
			c_lo[0] += acc00;
			c_lo += 1;
		}

		bb += (bk << 0);
		C += ldc;
	}

	return 0;
}

View File

@ -0,0 +1,280 @@
#include "common.h"
/*
 * Plain-C TRMM micro-kernel with 2x2 register blocking.
 *
 * For every tile of C it accumulates products of values read from ba and bb
 * over `temp` steps, scales the sums by alpha and STORES them into C (the
 * previous contents of C are overwritten, not accumulated into).
 *
 *   bm, bn  - extents of the C tile walked in steps of 2 (plus a single
 *             leftover row/column when odd)
 *   bk      - full depth of the ba/bb panels
 *   ldc     - distance in elements between consecutive columns of C
 *   offset  - initial value for the running offset `off` (TRMMKERNEL only)
 *
 * `off` selects which sub-range of the bk steps is used for each tile; which
 * end of the range is skipped depends on LEFT and TRANSA.
 * NOTE(review): presumably this restricts the product to the triangular part
 * of the operand -- confirm against the TRMM driver that calls this kernel.
 *
 * NOTE(review): `off` is only initialised when TRMMKERNEL is defined, but it
 * is read unconditionally below; the file appears to be meant for TRMMKERNEL
 * builds only.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
BLASLONG off, temp;
/* Right-side case: off starts at -offset once and grows per column panel. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
/* ---- panels of two C columns ---- */
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
/* Left-side case: off restarts at offset for every column panel and grows
   per row block. */
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* 2x2 tiles */
for (i=0; i<bm/2; i+=1)
{
/* Either start at the beginning of the bb panel, or skip the first `off`
   steps of both operands (2 values per step each). */
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
/* temp = number of steps to accumulate: the tail (bk-off) when the head was
   skipped above, otherwise the first off+2 steps. */
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+2;
#endif
/* main part: four steps per iteration */
for (k=0; k<temp/4; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*1+0];
load5 = ptrbb[2*1+0];
res0 = res0+load4*load5;
load6 = ptrba[2*1+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*1+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
load0 = ptrba[2*2+0];
load1 = ptrbb[2*2+0];
res0 = res0+load0*load1;
load2 = ptrba[2*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*2+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*3+0];
load5 = ptrbb[2*3+0];
res0 = res0+load4*load5;
load6 = ptrba[2*3+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*3+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
/* remaining (temp & 3) steps */
for (k=0; k<(temp&3); k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
/* scale by alpha and overwrite the 2x2 tile of C */
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
res2 = res2*alpha;
C1[0] = res2;
res3 = res3*alpha;
C1[1] = res3;
/* When only the first off+2 steps were used, step both pointers over the
   unused remainder (bk-off-2 steps) so ptrba ends at the next row block. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
C1 = C1+2;
}
/* leftover single row (bm odd): 1x2 tile, 1 value of ba and 2 of bb per step */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off;
ptrbb = bb+off*2;
#endif
res0 = 0;
res1 = 0;
/* step count: bk-off, or off plus the tile extent on the active side
   (1 row when LEFT, 2 columns otherwise) */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrbb[2*0+1];
res1 = res1+load0*load2;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C1[0] = res1;
/* skip the unused remainder of the panels */
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
C1 = C1+1;
}
/* Right-side case: two more columns have been processed. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
/* advance bb past this 2-column panel (2*bk values) and C by two columns */
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
/* ---- leftover single C column (bn odd) ---- */
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* 2x1 tiles: 2 values of ba and 1 of bb per step */
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off;
#endif
res0 = 0;
res1 = 0;
/* step count: bk-off, or off plus the tile extent on the active side
   (2 rows when LEFT, 1 column otherwise) */
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
/* skip the unused remainder of the panels */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
}
/* 1x1 tile: 1 value of each operand per step */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off;
ptrbb = bb+off;
#endif
res0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
/* skip the unused remainder of the panels */
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp;
ptrbb += temp;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
}
/* Right-side case: one more column has been processed. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
/* advance bb past this 1-column panel (bk values) and C by one column */
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -0,0 +1,838 @@
#include "common.h"
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
for (k=0; k<bk/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
#endif
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
for (k=0; k<(bk&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
load4 = res4*alphar;
C1[0] = C1[0]+load4;
load5 = res5*alphar;
C1[1] = C1[1]+load5;
load4 = res5*alphai;
C1[0] = C1[0]-load4;
load5 = res4*alphai;
C1[1] = C1[1]+load5;
load6 = res6*alphar;
C1[2] = C1[2]+load6;
load7 = res7*alphar;
C1[3] = C1[3]+load7;
load6 = res7*alphai;
C1[2] = C1[2]-load6;
load7 = res6*alphai;
C1[3] = C1[3]+load7;
C0 = C0+4;
C1 = C1+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3-load0*load5;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3-load0*load5;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C1[0] = C1[0]+load2;
load3 = res3*alphar;
C1[1] = C1[1]+load3;
load2 = res3*alphai;
C1[0] = C1[0]-load2;
load3 = res2*alphai;
C1[1] = C1[1]+load3;
C0 = C0+2;
C1 = C1+2;
}
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
C0 = C0+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

View File

@ -0,0 +1,923 @@
#include "common.h"
/*
 * Complex TRMM micro-kernel with 2x2 blocking on packed panels.
 *
 * Every complex value is stored as two consecutive FLOATs (real, imaginary).
 * For a = (ar + i*ai) and b = (br + i*bi) each k-step accumulates the four
 * partial products
 *     ADD1  ar*br      ADD2  ai*br
 *     ADD3  ar*bi      ADD4  ai*bi
 * with signs chosen at compile time by the NN/NR/RN/RR macro families.
 */

/*
 * Accumulate one complex product into (*re, *im).
 *
 * The update order per accumulator is fixed (re: ar*br then ai*bi,
 * im: ai*br then ar*bi) so that rounding matches the hand-expanded code
 * this helper replaces.  The four #if groups are kept independent, exactly
 * as in the expanded code: if no variant macro is defined nothing is added.
 */
static inline void ztrmm2x2_cmla(FLOAT *re, FLOAT *im,
                                 FLOAT ar, FLOAT ai, FLOAT br, FLOAT bi)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
    /* a * b */
    *re = *re + ar*br;
    *im = *im + ai*br;
    *re = *re - ai*bi;
    *im = *im + ar*bi;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
    /* a * conj(b) */
    *re = *re + ar*br;
    *im = *im + ai*br;
    *re = *re + ai*bi;
    *im = *im - ar*bi;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
    /* conj(a) * b */
    *re = *re + ar*br;
    *im = *im - ai*br;
    *re = *re + ai*bi;
    *im = *im + ar*bi;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
    /* conj(a) * conj(b) */
    *re = *re + ar*br;
    *im = *im - ai*br;
    *re = *re - ai*bi;
    *im = *im - ar*bi;
#endif
}

/*
 * One k-step of a full 2x2 tile.
 *
 * a points at two packed complex values (a0, a1), b at two packed complex
 * values (b0, b1).  acc holds eight FLOATs:
 *   acc[0..1] += a0*b0   acc[2..3] += a1*b0
 *   acc[4..5] += a0*b1   acc[6..7] += a1*b1
 */
static inline void ztrmm2x2_step(const FLOAT *a, const FLOAT *b, FLOAT *acc)
{
    ztrmm2x2_cmla(&acc[0], &acc[1], a[0], a[1], b[0], b[1]);
    ztrmm2x2_cmla(&acc[2], &acc[3], a[2], a[3], b[0], b[1]);
    ztrmm2x2_cmla(&acc[4], &acc[5], a[0], a[1], b[2], b[3]);
    ztrmm2x2_cmla(&acc[6], &acc[7], a[2], a[3], b[2], b[3]);
}

/*
 * Store alpha * (re + i*im) into c[0], c[1].  The destination is
 * overwritten; its previous contents are not read.
 */
static inline void ztrmm2x2_store(FLOAT *c, FLOAT re, FLOAT im,
                                  FLOAT alphar, FLOAT alphai)
{
    c[0] = re*alphar - im*alphai;
    c[1] = im*alphar + re*alphai;
}

/*
 * Compute a bm x bn block of complex results from the packed panels ba and
 * bb and write alpha times each result into C.
 *
 *   bm, bn   number of rows / columns handled, counted in complex elements;
 *            processed in pairs with a single-row / single-column remainder
 *   bk       depth of the packed panels in k-steps
 *   alphar,  real and imaginary part of the scalar applied to every
 *   alphai   accumulated value before it is stored
 *   ba       packed left panel: 4 FLOATs per k-step for a row pair,
 *            2 FLOATs per k-step for the odd last row; read sequentially
 *            across all row tiles of one column tile
 *   bb       packed right panel: 4 FLOATs per k-step for a column pair,
 *            2 FLOATs per k-step for the odd last column
 *   C        output, interleaved real/imag; overwritten, not accumulated
 *   ldc      distance between consecutive columns of C in complex elements
 *   offset   initial value of the running k offset `off` (used as is when
 *            LEFT is defined, negated otherwise)
 *            NOTE(review): presumably the diagonal offset of the triangular
 *            operand supplied by the TRMM driver - confirm against caller.
 *
 * Depending on LEFT/TRANSA each tile either starts at the beginning of the
 * panels and runs for `off + tile extent` k-steps, or skips the first `off`
 * k-steps and runs for `bk - off`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
          FLOAT* C,BLASLONG ldc, BLASLONG offset)
{
    BLASLONG i, j, k;
    FLOAT *C0, *C1, *ptrba, *ptrbb;
    FLOAT acc[8];
    /* off is read unconditionally below; give it a defined value even when
       TRMMKERNEL is not defined and none of the assignments is compiled. */
    BLASLONG off = 0, temp;

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---- column pairs ---- */
    for (j = 0; j < bn/2; j += 1)
    {
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        C0 = C;
        C1 = C0 + 2*ldc;
        ptrba = ba;

        /* 2 rows x 2 columns */
        for (i = 0; i < bm/2; i += 1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
            temp = off + 2;
#else
            ptrba += off*2*2;
            ptrbb = bb + off*2*2;
            temp = bk - off;
#endif
            for (k = 0; k < 8; k += 1)
            {
                acc[k] = 0;
            }

            /* Four k-steps per iteration, then the (temp & 3) leftovers.
               The split is kept as written so trip counts are unchanged. */
            for (k = 0; k < temp/4; k += 1)
            {
                ztrmm2x2_step(ptrba + 4*0, ptrbb + 4*0, acc);
                ztrmm2x2_step(ptrba + 4*1, ptrbb + 4*1, acc);
                ztrmm2x2_step(ptrba + 4*2, ptrbb + 4*2, acc);
                ztrmm2x2_step(ptrba + 4*3, ptrbb + 4*3, acc);
                ptrba += 16;
                ptrbb += 16;
            }
            for (k = 0; k < (temp&3); k += 1)
            {
                ztrmm2x2_step(ptrba, ptrbb, acc);
                ptrba += 4;
                ptrbb += 4;
            }

            ztrmm2x2_store(C0 + 0, acc[0], acc[1], alphar, alphai);
            ztrmm2x2_store(C0 + 2, acc[2], acc[3], alphar, alphai);
            ztrmm2x2_store(C1 + 0, acc[4], acc[5], alphar, alphai);
            ztrmm2x2_store(C1 + 2, acc[6], acc[7], alphar, alphai);

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            /* step over the k-steps this tile did not consume */
            temp = bk - off - 2;
            ptrba += temp*2*2;
            ptrbb += temp*2*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            C0 += 4;
            C1 += 4;
        }

        /* odd last row x 2 columns */
        for (i = 0; i < (bm&1); i += 1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#ifdef LEFT
            temp = off + 1;
#else
            temp = off + 2;
#endif
#else
            ptrba += off*2;
            ptrbb = bb + off*2*2;
            temp = bk - off;
#endif
            for (k = 0; k < 4; k += 1)
            {
                acc[k] = 0;
            }

            for (k = 0; k < temp; k += 1)
            {
                ztrmm2x2_cmla(&acc[0], &acc[1], ptrba[0], ptrba[1], ptrbb[0], ptrbb[1]);
                ztrmm2x2_cmla(&acc[2], &acc[3], ptrba[0], ptrba[1], ptrbb[2], ptrbb[3]);
                ptrba += 2;
                ptrbb += 4;
            }

            ztrmm2x2_store(C0, acc[0], acc[1], alphar, alphai);
            ztrmm2x2_store(C1, acc[2], acc[3], alphar, alphai);

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#ifdef LEFT
            temp = bk - off - 1;
#else
            temp = bk - off - 2;
#endif
            ptrba += temp*2;
            ptrbb += temp*2*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            C0 += 2;
            C1 += 2;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        /* next column pair: 2 columns * 2 FLOATs per k-step / per ldc */
        bb += 4*bk;
        C += 4*ldc;
    }

    /* ---- odd last column ---- */
    for (j = 0; j < (bn&1); j += 1)
    {
        C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        ptrba = ba;

        /* 2 rows x 1 column */
        for (i = 0; i < bm/2; i += 1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
#ifdef LEFT
            temp = off + 2;
#else
            temp = off + 1;
#endif
#else
            ptrba += off*2*2;
            ptrbb = bb + off*2;
            temp = bk - off;
#endif
            for (k = 0; k < 4; k += 1)
            {
                acc[k] = 0;
            }

            for (k = 0; k < temp; k += 1)
            {
                ztrmm2x2_cmla(&acc[0], &acc[1], ptrba[0], ptrba[1], ptrbb[0], ptrbb[1]);
                ztrmm2x2_cmla(&acc[2], &acc[3], ptrba[2], ptrba[3], ptrbb[0], ptrbb[1]);
                ptrba += 4;
                ptrbb += 2;
            }

            ztrmm2x2_store(C0 + 0, acc[0], acc[1], alphar, alphai);
            ztrmm2x2_store(C0 + 2, acc[2], acc[3], alphar, alphai);

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#ifdef LEFT
            temp = bk - off - 2;
#else
            temp = bk - off - 1;
#endif
            ptrba += temp*2*2;
            ptrbb += temp*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            C0 += 4;
        }

        /* odd last row x 1 column */
        for (i = 0; i < (bm&1); i += 1)
        {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            ptrbb = bb;
            temp = off + 1;
#else
            ptrba += off*2;
            ptrbb = bb + off*2;
            temp = bk - off;
#endif
            acc[0] = 0;
            acc[1] = 0;

            for (k = 0; k < temp; k += 1)
            {
                ztrmm2x2_cmla(&acc[0], &acc[1], ptrba[0], ptrba[1], ptrbb[0], ptrbb[1]);
                ptrba += 2;
                ptrbb += 2;
            }

            ztrmm2x2_store(C0, acc[0], acc[1], alphar, alphai);

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            temp = bk - off - 1;
            ptrba += temp*2;
            ptrbb += temp*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            C0 += 2;
        }

        /* next column: 1 column * 2 FLOATs per k-step / per ldc */
        bb += 2*bk;
        C += 2*ldc;
    }
    return 0;
}

View File

@ -0,0 +1,64 @@
# Kernel source selection. Each variable names the source file that
# implements one routine family for one precision prefix (S, D, C, Z).

# AXPY: assembly sources
SAXPYKERNEL    = axpy_loongson3a.S
DAXPYKERNEL    = daxpy_loongson3a_simd.S

# GEMV: C sources, one real and one complex file per N/T variant
SGEMVNKERNEL   = gemv_n_loongson3a.c
SGEMVTKERNEL   = gemv_t_loongson3a.c
DGEMVNKERNEL   = gemv_n_loongson3a.c
DGEMVTKERNEL   = gemv_t_loongson3a.c
CGEMVNKERNEL   = zgemv_n_loongson3a.c
CGEMVTKERNEL   = zgemv_t_loongson3a.c
ZGEMVNKERNEL   = zgemv_n_loongson3a.c
ZGEMVTKERNEL   = zgemv_t_loongson3a.c

# TRMM: generic 2x2 C kernels
STRMMKERNEL    = ../generic/trmmkernel_2x2.c
DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c

# GEMM, single precision real: generic 2x2 kernel and width-2 copy routines
SGEMMKERNEL    = ../generic/gemmkernel_2x2.c
SGEMMONCOPY    = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

# GEMM, double precision real
DGEMMKERNEL    = ../generic/gemmkernel_2x2.c
DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

# GEMM, single precision complex
CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

# GEMM, double precision complex
ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

# TRSM: the same four generic sources (LN, LT, RN, RT) for every prefix
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

47
param.h
View File

@ -1502,10 +1502,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80 #define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 1024 #define SGEMM_DEFAULT_R 640
#define DGEMM_DEFAULT_R dgemm_r #define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 1024 #define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 1024 #define ZGEMM_DEFAULT_R 640
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
#ifdef LOONGSON3B
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 24
#define CGEMM_DEFAULT_P 24
#define ZGEMM_DEFAULT_P 20
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
#define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000 #define GEMM_OFFSET_B1 0x100000