Merge branch 'loongson3b' into release-0.1.0

This commit is contained in:
Xianyi Zhang 2012-03-23 01:26:44 +08:00
commit 3871b6a86d
20 changed files with 10590 additions and 17 deletions

View File

@ -279,7 +279,12 @@ endif
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
@ -534,8 +539,10 @@ ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
ifeq ($(ARCH), mips64)
ifneq ($(CORE), LOONGSON3B)
USE_SIMPLE_THREADED_LEVEL3 = 1
endif
endif
ifeq ($(USE_OPENMP), 1)
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -600,9 +607,11 @@ endif
ifneq ($(ARCH), x86_64)
ifneq ($(ARCH), x86)
ifneq ($(CORE), LOONGSON3B)
NO_AFFINITY = 1
endif
endif
endif
ifdef NO_AFFINITY
CCOMMON_OPT += -DNO_AFFINITY

View File

@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
/*
 * Thin wrapper that issues the SYS_mbind system call directly.
 *
 * Three build variants exist:
 *   - LOONGSON3B, 32-bit: no system call is made; 0 is returned.
 *   - LOONGSON3B, 64-bit: all arguments, including nodemask, are forwarded.
 *   - everything else:    the caller's nodemask is ignored and a zeroed
 *                         local mask is passed in its place.
 *
 * Returns the system call's result narrowed to int (0 in the 32-bit
 * Loongson 3B variant).
 */
static inline int my_mbind(void *addr, unsigned long len, int mode,
                           unsigned long *nodemask, unsigned long maxnode,
                           unsigned flags) {
#if defined (LOONGSON3B) && !defined (__64BIT__)
  /* NULL implementation on Loongson 3B 32bit. */
  return 0;
#elif defined (LOONGSON3B)
  return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
  /* Work around random SEGFAULTs seen when nodemask == NULL on kernels
     above Linux 2.6.34: always hand the kernel a valid, empty mask. */
  unsigned long empty_mask = 0;

  return syscall(SYS_mbind, addr, len, mode, &empty_mask, maxnode, flags);
#endif
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

View File

@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){
unsigned long ret;
#if defined(LOONGSON3A)
unsigned long long tmp;
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
ret=tmp;
#if defined(LOONGSON3A) || defined(LOONGSON3B)
// unsigned long long tmp;
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
return ret;
}
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
/*
 * Reads hardware register $0 with the MIPS32r2 `rdhwr` instruction and
 * returns its value.  In the MIPS32r2 architecture that register holds
 * the number of the CPU the code is running on.
 */
static inline int WhereAmI(void){
  int cpu_num = 0;

  /* Temporarily switch the assembler to mips32r2 so that rdhwr is
     accepted, then restore the previous assembler settings. */
  __asm__ __volatile__(
      ".set push \n"
      ".set mips32r2\n"
      "rdhwr %0, $0\n"
      ".set pop"
      : "=r"(cpu_num)
      :
      : "memory");

  return cpu_num;
}
#endif
#endif
/*
 * Integer quotient x / y using the plain C division operator.
 * The divisor is not checked; y must be non-zero.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  const blasint quotient = x / y;

  return quotient;
}
@ -234,6 +254,11 @@ REALNAME: ;\
#define FIXED_PAGESIZE (16UL << 10)
#endif
#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#endif
#ifndef PAGESIZE
#define PAGESIZE (64UL << 10)
#endif
@ -245,7 +270,7 @@ REALNAME: ;\
#define MAP_ANONYMOUS MAP_ANON
#endif
#if defined(LOONGSON3A)
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#define PREFETCHD_(x) ld $0, x
#define PREFETCHD(x) PREFETCHD_(x)
#else

View File

@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A"
"LOONGSON3A",
"LOONGSON3B"
};
int detect(void){
@ -101,6 +103,8 @@ int detect(void){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
@ -130,6 +134,8 @@ void get_architecture(void){
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
}else{
printf("SICORTEX");
}
@ -149,6 +155,15 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n");
@ -164,6 +179,8 @@ void get_cpuconfig(void){
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
}else{
#ifdef __mips64
printf("mips64\n");

View File

@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
range_M[0] = 0;
i = arg -> m;
} else {
range_M[0] = range_M[0];
i = range_M[1] - range_M[0];
range_M[0] = range_m[0];
i = range_m[1] - range_m[0];
}
num_cpu_m = 0;

View File

@ -55,8 +55,8 @@ int CNAME(int mode,
range_M[0] = 0;
i = arg -> m;
} else {
range_M[0] = range_M[0];
i = range_M[1] - range_M[0];
range_M[0] = range_m[0];
i = range_m[1] - range_m[0];
}
num_cpu_m = 0;

View File

@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){
if (map_address != (void *)-1) {
#ifdef OS_LINUX
#ifdef DEBUG
int ret;
#if 1
//#ifdef DEBUG
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("alloc_mmap:");
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}

View File

@ -696,5 +696,20 @@ void blas_set_parameter(void){
}
#endif
#endif
#if defined(LOONGSON3B)
#ifdef SMP
if(blas_num_threads == 1 || blas_num_threads == 2){
#endif
//single thread
dgemm_r = 640;
#ifdef SMP
}else{
//multi thread
dgemm_r = 160;
}
#endif
#endif
}
#endif

View File

@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */
@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/*
 * Forced-target settings for Loongson 3B.  When FORCE_LOONGSON3B is
 * defined, this group defines FORCE together with the architecture and
 * sub-architecture names, the kernel sub-directory name, the -D flags
 * describing the cache/TLB configuration, the library name and the core
 * name for that target.
 */
#ifdef FORCE_LOONGSON3B
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define SUBDIRNAME "mips64"
/* NOTE(review): L2_SIZE=512488 is not a power of two and looks like a
   digit transposition of 524288 (512 KiB) -- confirm against the CPU
   documentation before relying on this value. */
#define ARCHCONFIG "-DLOONGSON3B " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3b"
#define CORENAME "LOONGSON3B"
#else
#endif
#ifdef FORCE_ITANIUM2
#define FORCE
#define ARCHITECTURE "IA64"

View File

@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
ifeq ($(TARGET), LOONGSON3B)
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

View File

@ -0,0 +1,157 @@
#include "common.h"
/*
 * Plain-C GEMM micro-kernel working on 2x2 tiles of C.
 *
 * For every tile the kernel accumulates, over bk steps, the products of
 * entries read sequentially from the packed panels `ba` and `bb`, scales
 * each sum by `alpha` and ADDS it to the corresponding entry of C.
 *
 *   bm, bn  - number of rows / columns of C handled by this call
 *   bk      - number of accumulation steps per entry
 *   alpha   - scalar applied to every accumulated sum
 *   ba      - packed panel read in groups of 2 values per step for a row
 *             pair (1 value per step for the trailing odd row)
 *   bb      - packed panel read in groups of 2 values per step for a
 *             column pair (1 value per step for the trailing odd column)
 *   C, ldc  - output; consecutive columns of C are ldc elements apart
 *   offset  - (TRMMKERNEL builds only) accepted but not used here
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
    BLASLONG col_pair, row_pair, group, step;
    FLOAT *c_lo, *c_hi, *pa, *pb;
    FLOAT a0, a1, b0, b1;
    FLOAT acc00, acc10, acc01, acc11, scaled;

    /* ---- column pairs ------------------------------------------------ */
    for (col_pair = 0; col_pair < bn / 2; col_pair++) {
        c_lo = C;
        c_hi = c_lo + ldc;
        pa = ba;

        /* 2x2 tiles */
        for (row_pair = 0; row_pair < bm / 2; row_pair++) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;
            acc01 = 0;
            acc11 = 0;

            /* bk/4 groups of four steps each ... */
            for (group = 0; group < bk / 4; group++) {
                for (step = 0; step < 4; step++) {
                    a0 = pa[0];
                    b0 = pb[0];
                    acc00 = acc00 + a0 * b0;
                    a1 = pa[1];
                    acc10 = acc10 + a1 * b0;
                    b1 = pb[1];
                    acc01 = acc01 + a0 * b1;
                    acc11 = acc11 + a1 * b1;
                    pa += 2;
                    pb += 2;
                }
            }
            /* ... followed by the (bk & 3) leftover steps. */
            for (step = 0; step < (bk & 3); step++) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                a1 = pa[1];
                acc10 = acc10 + a1 * b0;
                b1 = pb[1];
                acc01 = acc01 + a0 * b1;
                acc11 = acc11 + a1 * b1;
                pa += 2;
                pb += 2;
            }

            scaled = acc00 * alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc10 * alpha;
            c_lo[1] = c_lo[1] + scaled;
            scaled = acc01 * alpha;
            c_hi[0] = c_hi[0] + scaled;
            scaled = acc11 * alpha;
            c_hi[1] = c_hi[1] + scaled;

            c_lo += 2;
            c_hi += 2;
        }

        /* trailing single row (1x2 tile) */
        if (bm & 1) {
            pb = bb;
            acc00 = 0;
            acc01 = 0;

            for (step = 0; step < bk; step++) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                b1 = pb[1];
                acc01 = acc01 + a0 * b1;
                pa += 1;
                pb += 2;
            }

            scaled = acc00 * alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc01 * alpha;
            c_hi[0] = c_hi[0] + scaled;
        }

        /* move on to the next column pair: 2*bk panel values, 2 columns */
        bb += (bk << 1);
        C += (ldc << 1);
    }

    /* ---- trailing single column -------------------------------------- */
    if (bn & 1) {
        c_lo = C;
        pa = ba;

        /* 2x1 tiles */
        for (row_pair = 0; row_pair < bm / 2; row_pair++) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;

            for (step = 0; step < bk; step++) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                a1 = pa[1];
                acc10 = acc10 + a1 * b0;
                pa += 2;
                pb += 1;
            }

            scaled = acc00 * alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc10 * alpha;
            c_lo[1] = c_lo[1] + scaled;

            c_lo += 2;
        }

        /* final 1x1 corner */
        if (bm & 1) {
            pb = bb;
            acc00 = 0;

            for (step = 0; step < bk; step++) {
                a0 = pa[0];
                b0 = pb[0];
                acc00 = acc00 + a0 * b0;
                pa += 1;
                pb += 1;
            }

            scaled = acc00 * alpha;
            c_lo[0] = c_lo[0] + scaled;
        }
    }

    return 0;
}

View File

@ -0,0 +1,280 @@
#include "common.h"
/*
 * Plain-C TRMM micro-kernel working on 2x2 tiles of C, with 1-wide edge
 * cases for an odd bm and/or an odd bn.
 *
 * For every tile it accumulates products of entries read sequentially
 * from the packed panels `ba` and `bb`, scales the sums by `alpha` and
 * OVERWRITES the corresponding entries of C (nothing is added to the old
 * contents of C).
 *
 * Unlike a full GEMM, only part of the bk range is accumulated per tile.
 * The running value `off` (seeded from `offset`) together with the
 * LEFT / TRANSA build flags decides
 *   - whether the first `off` steps of both panels are skipped, and
 *   - how many steps (`temp`) are accumulated.
 * In every variant ptrba has advanced by exactly bk steps once a tile is
 * finished, so it is positioned at the start of the next row tile's data.
 *
 *   bm, bn  - number of rows / columns of C handled by this call
 *   bk      - full length of the packed panels, in steps
 *   alpha   - scalar applied to every accumulated sum
 *   ba, bb  - packed input panels
 *   C, ldc  - output; consecutive columns of C are ldc elements apart
 *   offset  - (TRMMKERNEL builds only) initial value for `off`
 *
 * NOTE(review): `off` is only initialised when TRMMKERNEL is defined but
 * is read unconditionally; presumably this file is always compiled with
 * -DTRMMKERNEL -- confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
BLASLONG off, temp;
/* Without LEFT, `off` starts at -offset once and then grows with every
   group of columns processed. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
/* ---- column pairs ---- */
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
/* With LEFT, `off` restarts at offset for every group of columns and
   then grows with every row tile processed. */
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* 2x2 tiles */
for (i=0; i<bm/2; i+=1)
{
/* Either start at the beginning of both panels, or skip the first
   `off` steps (2 values per step in each panel). */
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
/* Number of steps to accumulate: the remainder of the panel after the
   skipped part, or `off` plus the tile extent (2 here). */
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+2;
#endif
/* Main loop, manually unrolled: four steps per iteration. */
for (k=0; k<temp/4; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*1+0];
load5 = ptrbb[2*1+0];
res0 = res0+load4*load5;
load6 = ptrba[2*1+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*1+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
load0 = ptrba[2*2+0];
load1 = ptrbb[2*2+0];
res0 = res0+load0*load1;
load2 = ptrba[2*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*2+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*3+0];
load5 = ptrbb[2*3+0];
res0 = res0+load4*load5;
load6 = ptrba[2*3+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*3+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
/* Leftover (temp & 3) steps, one at a time. */
for (k=0; k<(temp&3); k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
/* Scale and store; the previous contents of C are overwritten. */
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
res2 = res2*alpha;
C1[0] = res2;
res3 = res3*alpha;
C1[1] = res3;
/* When accumulation started at the beginning of the panels, step over
   the part of the bk range that was not accumulated. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
C1 = C1+2;
}
/* trailing single row (1x2 tile): 1 value per step from ba, 2 from bb */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off;
ptrbb = bb+off*2;
#endif
res0 = 0;
res1 = 0;
/* Step count: LEFT adds the tile height (1), otherwise the tile width (2). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrbb[2*0+1];
res1 = res1+load0*load2;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C1[0] = res1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
C1 = C1+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
/* Advance to the next column pair: 2*bk values in bb, 2 columns of C. */
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
/* ---- trailing single column ---- */
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* 2x1 tiles: 2 values per step from ba, 1 from bb */
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off;
#endif
res0 = 0;
res1 = 0;
/* Step count: LEFT adds the tile height (2), otherwise the tile width (1). */
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
}
/* final 1x1 corner: 1 value per step from each panel */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off;
ptrbb = bb+off;
#endif
res0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp;
ptrbb += temp;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
/* Advance past this single column: bk values in bb, 1 column of C. */
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -0,0 +1,838 @@
#include "common.h"
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
for (k=0; k<bk/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
#endif
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
for (k=0; k<(bk&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
load4 = res4*alphar;
C1[0] = C1[0]+load4;
load5 = res5*alphar;
C1[1] = C1[1]+load5;
load4 = res5*alphai;
C1[0] = C1[0]-load4;
load5 = res4*alphai;
C1[1] = C1[1]+load5;
load6 = res6*alphar;
C1[2] = C1[2]+load6;
load7 = res7*alphar;
C1[3] = C1[3]+load7;
load6 = res7*alphai;
C1[2] = C1[2]-load6;
load7 = res6*alphai;
C1[3] = C1[3]+load7;
C0 = C0+4;
C1 = C1+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3-load0*load5;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3-load0*load5;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C1[0] = C1[0]+load2;
load3 = res3*alphar;
C1[1] = C1[1]+load3;
load2 = res3*alphai;
C1[0] = C1[0]-load2;
load3 = res2*alphai;
C1[1] = C1[1]+load3;
C0 = C0+2;
C1 = C1+2;
}
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
C0 = C0+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

View File

@ -0,0 +1,923 @@
#include "common.h"
/********************************
The complex product (a + b*i) * (c + d*i) is built from four real products:
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
/*
 * ZTRMM2X2_CMAC: one complex multiply-accumulate.
 *
 * Reads a = ap[0] + i*ap[1] and b = bp[0] + i*bp[1] and folds the four real
 * products into the accumulator pair (rr, ri).  re(a)*re(b) is always added
 * to rr; S_AIBR, S_AIBI and S_ARBI are the literal tokens + or - applied to
 * im(a)*re(b) (into ri), im(a)*im(b) (into rr) and re(a)*im(b) (into ri).
 * Each accumulator is updated in a fixed order (rr: ar*br, then ai*bi;
 * ri: ai*br, then ar*bi) so the rounding sequence is well defined.
 *
 * Macros are used instead of helper functions so the expansion does not
 * depend on the compiler's inlining decisions.
 */
#define ZTRMM2X2_CMAC(ap, bp, rr, ri, S_AIBR, S_AIBI, S_ARBI) \
    do { \
        const FLOAT zk_ar = (ap)[0]; \
        const FLOAT zk_br = (bp)[0]; \
        const FLOAT zk_ai = (ap)[1]; \
        const FLOAT zk_bi = (bp)[1]; \
        (rr) = (rr) + zk_ar*zk_br; \
        (ri) = (ri) S_AIBR zk_ai*zk_br; \
        (rr) = (rr) S_AIBI zk_ai*zk_bi; \
        (ri) = (ri) S_ARBI zk_ar*zk_bi; \
    } while (0)

/* Placeholder for a conjugation variant that is not selected in this build. */
#define ZTRMM2X2_NOP(ap, bp, rr, ri) do { } while (0)

/* Neither operand conjugated: (ar*br - ai*bi) + i*(ai*br + ar*bi). */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ZTRMM2X2_MAC_NN(ap, bp, rr, ri) ZTRMM2X2_CMAC(ap, bp, rr, ri, +, -, +)
#else
#define ZTRMM2X2_MAC_NN(ap, bp, rr, ri) ZTRMM2X2_NOP(ap, bp, rr, ri)
#endif

/* Second operand conjugated: (ar*br + ai*bi) + i*(ai*br - ar*bi). */
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ZTRMM2X2_MAC_NR(ap, bp, rr, ri) ZTRMM2X2_CMAC(ap, bp, rr, ri, +, +, -)
#else
#define ZTRMM2X2_MAC_NR(ap, bp, rr, ri) ZTRMM2X2_NOP(ap, bp, rr, ri)
#endif

/* First operand conjugated: (ar*br + ai*bi) + i*(ar*bi - ai*br). */
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ZTRMM2X2_MAC_RN(ap, bp, rr, ri) ZTRMM2X2_CMAC(ap, bp, rr, ri, -, +, +)
#else
#define ZTRMM2X2_MAC_RN(ap, bp, rr, ri) ZTRMM2X2_NOP(ap, bp, rr, ri)
#endif

/* Both operands conjugated: (ar*br - ai*bi) - i*(ai*br + ar*bi). */
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define ZTRMM2X2_MAC_RR(ap, bp, rr, ri) ZTRMM2X2_CMAC(ap, bp, rr, ri, -, -, -)
#else
#define ZTRMM2X2_MAC_RR(ap, bp, rr, ri) ZTRMM2X2_NOP(ap, bp, rr, ri)
#endif

/*
 * Multiply-accumulate for whichever conjugation variant(s) the build selected.
 * The four groups are tested independently; a group that is not selected
 * expands to nothing (and performs no loads).
 */
#define ZTRMM2X2_MAC(ap, bp, rr, ri) \
    do { \
        ZTRMM2X2_MAC_NN(ap, bp, rr, ri); \
        ZTRMM2X2_MAC_NR(ap, bp, rr, ri); \
        ZTRMM2X2_MAC_RN(ap, bp, rr, ri); \
        ZTRMM2X2_MAC_RR(ap, bp, rr, ri); \
    } while (0)

/*
 * Complex triangular-multiply micro-kernel working on 2x2 tiles of C.
 *
 *   bm, bn   number of rows / columns of C to produce; both are walked in
 *            pairs first (bm/2, bn/2) and then a single leftover (bm&1, bn&1).
 *   bk       depth of the product; also the per-column length used to step bb.
 *   alphar,
 *   alphai   real and imaginary part of the scalar applied to every result.
 *   ba, bb   operand panels, read sequentially as interleaved (re, im) FLOATs:
 *            a 2-row tile consumes 4 FLOATs of ba per k step, a 2-column tile
 *            4 FLOATs of bb per k step.
 *   C, ldc   output; element (r, c) lives at C[2*(r + c*ldc)].  C is
 *            overwritten with alpha * (accumulated product), not added to.
 *   offset   start value for the running offset `off` that trims the k range
 *            of each tile (presumably the diagonal position of the triangular
 *            operand -- confirm against the caller).
 *
 * LEFT / TRANSA select which part of the k range is visited and how the panel
 * pointers are positioned before and rewound after each tile.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
          FLOAT* C,BLASLONG ldc, BLASLONG offset)
{
    BLASLONG row, col, kk;
    BLASLONG off, temp;
    FLOAT *c_col0, *c_col1;   /* write cursors for the current column(s) of C */
    FLOAT *pa, *pb;           /* read cursors into ba / bb */
    /* Tile accumulators, named r<row>c<col>_<re|im>. */
    FLOAT r0c0_re, r0c0_im, r1c0_re, r1c0_im;
    FLOAT r0c1_re, r0c1_im, r1c1_re, r1c1_im;

#if defined(TRMMKERNEL) && !defined(LEFT)
    off = -offset;
#endif

    /* ---- pairs of columns ------------------------------------------------ */
    for (col = 0; col < bn/2; col++) {
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        c_col0 = C;
        c_col1 = c_col0 + 2*ldc;
        pa = ba;

        /* 2 rows x 2 columns */
        for (row = 0; row < bm/2; row++) {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            pb = bb;
#else
            /* Skip the first `off` k steps of both panels. */
            pa += off*2*2;
            pb = bb + off*2*2;
#endif
            r0c0_re = r0c0_im = 0;
            r1c0_re = r1c0_im = 0;
            r0c1_re = r0c1_im = 0;
            r1c1_re = r1c1_im = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk - off;
#else
            temp = off + 2;     /* tile is 2 wide in both directions */
#endif

            /* Main part of the k range, four k steps per iteration. */
            for (kk = 0; kk < temp/4; kk++) {
                /* k step 0 */
                ZTRMM2X2_MAC(pa,      pb,      r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 2,  pb,      r1c0_re, r1c0_im);
                ZTRMM2X2_MAC(pa,      pb + 2,  r0c1_re, r0c1_im);
                ZTRMM2X2_MAC(pa + 2,  pb + 2,  r1c1_re, r1c1_im);
                /* k step 1 */
                ZTRMM2X2_MAC(pa + 4,  pb + 4,  r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 6,  pb + 4,  r1c0_re, r1c0_im);
                ZTRMM2X2_MAC(pa + 4,  pb + 6,  r0c1_re, r0c1_im);
                ZTRMM2X2_MAC(pa + 6,  pb + 6,  r1c1_re, r1c1_im);
                /* k step 2 */
                ZTRMM2X2_MAC(pa + 8,  pb + 8,  r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 10, pb + 8,  r1c0_re, r1c0_im);
                ZTRMM2X2_MAC(pa + 8,  pb + 10, r0c1_re, r0c1_im);
                ZTRMM2X2_MAC(pa + 10, pb + 10, r1c1_re, r1c1_im);
                /* k step 3 */
                ZTRMM2X2_MAC(pa + 12, pb + 12, r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 14, pb + 12, r1c0_re, r1c0_im);
                ZTRMM2X2_MAC(pa + 12, pb + 14, r0c1_re, r0c1_im);
                ZTRMM2X2_MAC(pa + 14, pb + 14, r1c1_re, r1c1_im);
                pa += 16;
                pb += 16;
            }
            /* Remaining (temp & 3) k steps, one at a time. */
            for (kk = 0; kk < (temp&3); kk++) {
                ZTRMM2X2_MAC(pa,     pb,     r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 2, pb,     r1c0_re, r1c0_im);
                ZTRMM2X2_MAC(pa,     pb + 2, r0c1_re, r0c1_im);
                ZTRMM2X2_MAC(pa + 2, pb + 2, r1c1_re, r1c1_im);
                pa += 4;
                pb += 4;
            }

            /* C tile = alpha * accumulated product (complex scaling). */
            c_col0[0] = r0c0_re*alphar - r0c0_im*alphai;
            c_col0[1] = r0c0_im*alphar + r0c0_re*alphai;
            c_col0[2] = r1c0_re*alphar - r1c0_im*alphai;
            c_col0[3] = r1c0_im*alphar + r1c0_re*alphai;
            c_col1[0] = r0c1_re*alphar - r0c1_im*alphai;
            c_col1[1] = r0c1_im*alphar + r0c1_re*alphai;
            c_col1[2] = r1c1_re*alphar - r1c1_im*alphai;
            c_col1[3] = r1c1_im*alphar + r1c1_re*alphai;

#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
            /* Step both cursors over the part of the k range not visited. */
            temp = bk - off - 2;
            pa += temp*2*2;
            pb += temp*2*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_col0 += 4;
            c_col1 += 4;
        }

        /* leftover single row x 2 columns */
        for (row = 0; row < (bm&1); row++) {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            pb = bb;
#else
            pa += off*2;
            pb = bb + off*2*2;
#endif
            r0c0_re = r0c0_im = 0;
            r0c1_re = r0c1_im = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk - off;
#elif defined(LEFT)
            temp = off + 1;     /* one row in this tile */
#else
            temp = off + 2;     /* two columns in this tile */
#endif
            for (kk = 0; kk < temp; kk++) {
                ZTRMM2X2_MAC(pa, pb,     r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa, pb + 2, r0c1_re, r0c1_im);
                pa += 2;
                pb += 4;
            }

            c_col0[0] = r0c0_re*alphar - r0c0_im*alphai;
            c_col0[1] = r0c0_im*alphar + r0c0_re*alphai;
            c_col1[0] = r0c1_re*alphar - r0c1_im*alphai;
            c_col1[1] = r0c1_im*alphar + r0c1_re*alphai;

#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
#ifdef LEFT
            temp = bk - off - 1;
#else
            temp = bk - off - 2;
#endif
            pa += temp*2;
            pb += temp*2*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_col0 += 2;
            c_col1 += 2;
        }

#if defined(TRMMKERNEL) && !defined(LEFT)
        off += 2;
#endif
        /* Next column pair: 2 columns * 2 FLOATs per complex element. */
        bb += (bk<<2);
        C  += (ldc<<2);
    }

    /* ---- leftover single column ------------------------------------------ */
    for (col = 0; col < (bn&1); col++) {
        c_col0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
        off = offset;
#endif
        pa = ba;

        /* 2 rows x 1 column */
        for (row = 0; row < bm/2; row++) {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            pb = bb;
#else
            pa += off*2*2;
            pb = bb + off*2;
#endif
            r0c0_re = r0c0_im = 0;
            r1c0_re = r1c0_im = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk - off;
#elif defined(LEFT)
            temp = off + 2;     /* two rows in this tile */
#else
            temp = off + 1;     /* one column in this tile */
#endif
            for (kk = 0; kk < temp; kk++) {
                ZTRMM2X2_MAC(pa,     pb, r0c0_re, r0c0_im);
                ZTRMM2X2_MAC(pa + 2, pb, r1c0_re, r1c0_im);
                pa += 4;
                pb += 2;
            }

            c_col0[0] = r0c0_re*alphar - r0c0_im*alphai;
            c_col0[1] = r0c0_im*alphar + r0c0_re*alphai;
            c_col0[2] = r1c0_re*alphar - r1c0_im*alphai;
            c_col0[3] = r1c0_im*alphar + r1c0_re*alphai;

#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
#ifdef LEFT
            temp = bk - off - 2;
#else
            temp = bk - off - 1;
#endif
            pa += temp*2*2;
            pb += temp*2;
#endif
#ifdef LEFT
            off += 2;
#endif
            c_col0 += 4;
        }

        /* leftover single row x 1 column */
        for (row = 0; row < (bm&1); row++) {
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
            pb = bb;
#else
            pa += off*2;
            pb = bb + off*2;
#endif
            r0c0_re = r0c0_im = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
            temp = bk - off;
#else
            temp = off + 1;     /* tile is 1 wide in both directions */
#endif
            for (kk = 0; kk < temp; kk++) {
                ZTRMM2X2_MAC(pa, pb, r0c0_re, r0c0_im);
                pa += 2;
                pb += 2;
            }

            c_col0[0] = r0c0_re*alphar - r0c0_im*alphai;
            c_col0[1] = r0c0_im*alphar + r0c0_re*alphai;

#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
            temp = bk - off - 1;
            pa += temp*2;
            pb += temp*2;
#endif
#ifdef LEFT
            off += 1;
#endif
            c_col0 += 2;
        }

        /* Next column: 1 column * 2 FLOATs per complex element. */
        bb += (bk<<1);
        C  += (ldc<<1);
    }
    return 0;
}

#undef ZTRMM2X2_MAC
#undef ZTRMM2X2_MAC_RR
#undef ZTRMM2X2_MAC_RN
#undef ZTRMM2X2_MAC_NR
#undef ZTRMM2X2_MAC_NN
#undef ZTRMM2X2_NOP
#undef ZTRMM2X2_CMAC

View File

@ -0,0 +1,64 @@
# Kernel source selection: each variable names the source file used for one
# routine family; the leading S/D/C/Z letter is the precision variant.
# NOTE(review): the file name is not visible here -- presumably the
# Loongson 3B kernel list added by this merge; confirm.

# AXPY: assembly sources carrying the loongson3a name.
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
# GEMV (N and T): C sources carrying the loongson3a name; S and D share one
# file, C and Z share the z* file.
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
# TRMM: generic C 2x2 kernels (real for S/D, complex z* for C/Z).
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM: generic C 2x2 kernels plus the matching width-2 N/T copy sources.
# The *COPYOBJ variables presumably name the object files built from them.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM: all four precisions point at the same generic LN/LT/RN/RT sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

47
param.h
View File

@ -1502,10 +1502,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 1024
#define SGEMM_DEFAULT_R 640
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 1024
#define ZGEMM_DEFAULT_R 1024
#define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 640
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
#ifdef LOONGSON3B
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 24
#define CGEMM_DEFAULT_P 24
#define ZGEMM_DEFAULT_P 20
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000