Merge branch 'release-0.1.0' into develop

This commit is contained in:
Xianyi Zhang 2012-03-23 18:53:51 +08:00
commit 5cbbc496b0
42 changed files with 25904 additions and 48 deletions

View File

@ -1,4 +1,22 @@
OpenBLAS ChangeLog
====================================================================
Version 0.1.0
23-Mar-2012
common:
* Set soname of shared library on Linux.
* Added LIBNAMESUFFIX flag in Makefile.rule. The user can use
this flag to control the library name, e.g. libopenblas.a,
libopenblas_ifort.a or libopenblas_omp.a.
* Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule.
The library uses a single thread in GEMM functions for small matrices.
x86/x86_64:
* Used GEMV SSE/SSE2 kernels on x86 32-bit.
* Exported CBLAS functions in Windows DLL.
MIPS64:
* Completed Level-3 BLAS optimization on Loongson 3A CPU.
* Improved GEMV performance on Loongson 3A CPU.
* Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL)
====================================================================
Version 0.1 alpha2.5
19-Feb-2012

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.1alpha2.5
VERSION = 0.1.0
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -279,7 +279,12 @@ endif
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
@ -534,8 +539,10 @@ ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
ifeq ($(ARCH), mips64)
ifneq ($(CORE), LOONGSON3B)
USE_SIMPLE_THREADED_LEVEL3 = 1
endif
endif
ifeq ($(USE_OPENMP), 1)
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -600,9 +607,11 @@ endif
ifneq ($(ARCH), x86_64)
ifneq ($(ARCH), x86)
ifneq ($(CORE), LOONGSON3B)
NO_AFFINITY = 1
endif
endif
endif
ifdef NO_AFFINITY
CCOMMON_OPT += -DNO_AFFINITY

1
README
View File

@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
9.Known Issues
* The number of CPUs/cores should be less than or equal to 8*sizeof(unsigned long). On 64-bit systems, the limit
is 64; on 32-bit systems, it is 32.
* On Loongson 3A, "make test" may fail because of a pthread_create error (error code EAGAIN). However, the same test cases pass when they are run directly from the shell. I don't think this is a bug in OpenBLAS.
10. Specification of Git Branches
We use the git branching model described in this article (http://nvie.com/posts/a-successful-git-branching-model/).

View File

@ -68,9 +68,17 @@ extern long int syscall (long int __sysno, ...);
/*
 * Thin wrapper that issues the SYS_mbind system call through syscall() and
 * returns its result.
 *
 * All six arguments are forwarded to the system call, except where noted
 * below for the node mask.
 *
 * With LOONGSON3B defined: the call is made only when __64BIT__ is defined;
 * otherwise the function does nothing and returns 0.
 * Without LOONGSON3B: see the notes inside the #else branch.
 */
static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode,
unsigned flags) {
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
return 0; //NULL Implementation on Loongson 3B 32bit.
#endif
#else
// Works around a random SEGFAULT seen when nodemask==NULL with kernels above Linux 2.6.34:
// the address of a local, zero-valued mask is passed instead of the caller's nodemask.
unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags);
// NOTE(review): as written, the return below can never execute because the
// return above always runs first. This looks like the removed and the added
// variant of the same change shown together -- confirm which one is intended.
// unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

View File

@ -2127,7 +2127,9 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sgemm_p;
extern BLASLONG sgemm_q;
extern BLASLONG sgemm_r;

View File

@ -101,10 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){
static inline unsigned int rpcc(void){
unsigned long ret;
#if defined(LOONGSON3A)
unsigned long long tmp;
__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
ret=tmp;
#if defined(LOONGSON3A) || defined(LOONGSON3B)
// unsigned long long tmp;
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
@ -114,6 +119,21 @@ static inline unsigned int rpcc(void){
return ret;
}
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
/*
 * Return the value of hardware register $0 as read by the MIPS `rdhwr`
 * instruction.
 * NOTE(review): in the MIPS32 release 2 architecture hardware register 0 is
 * the number of the CPU executing the instruction, which matches the
 * function name -- confirm against the architecture manual.
 */
static inline int WhereAmI(void){
int ret=0;
/* .set push / .set pop bracket the temporary switch to the mips32r2 ISA
   level that is selected for the rdhwr instruction. */
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $0\n"
".set pop": "=r"(ret):: "memory");
return ret;
}
#endif
#endif
/*
 * Divide x by y with the ordinary integer division operator and return the
 * quotient converted to int. No check is made for y == 0.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  int quotient = (int)(x / y);

  return quotient;
}
@ -152,6 +172,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define CMPEQ c.eq.d
#define CMPLE c.le.d
#define CMPLT c.lt.d
#define NEG neg.d
#else
#define LD lwc1
#define ST swc1
@ -170,6 +191,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define CMPEQ c.eq.s
#define CMPLE c.le.s
#define CMPLT c.lt.s
#define PLU plu.ps
#define PLL pll.ps
#define PUU puu.ps
#define PUL pul.ps
#define MADPS madd.ps
#define CVTU cvt.s.pu
#define CVTL cvt.s.pl
#define NEG neg.s
#endif
#if defined(__64BIT__) && defined(USE64BITINT)
@ -218,13 +247,18 @@ REALNAME: ;\
#define SEEK_ADDRESS
#define BUFFER_SIZE ( 8 << 20)
#define BUFFER_SIZE ( 32 << 20)
#if defined(LOONGSON3A)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif
#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#endif
#ifndef PAGESIZE
#define PAGESIZE (64UL << 10)
#endif
@ -236,7 +270,7 @@ REALNAME: ;\
#define MAP_ANONYMOUS MAP_ANON
#endif
#if defined(LOONGSON3A)
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#define PREFETCHD_(x) ld $0, x
#define PREFETCHD(x) PREFETCHD_(x)
#else

View File

@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A"
"LOONGSON3A",
"LOONGSON3B"
};
int detect(void){
@ -101,6 +103,8 @@ int detect(void){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
@ -130,6 +134,8 @@ void get_architecture(void){
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
}else{
printf("SICORTEX");
}
@ -149,6 +155,15 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n");
@ -164,6 +179,8 @@ void get_cpuconfig(void){
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
}else{
#ifdef __mips64
printf("mips64\n");

View File

@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
range_M[0] = 0;
i = arg -> m;
} else {
range_M[0] = range_M[0];
i = range_M[1] - range_M[0];
range_M[0] = range_m[0];
i = range_m[1] - range_m[0];
}
num_cpu_m = 0;

View File

@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu];
queue[num_cpu].sa = NULL;
#if defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
#endif
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
}
if (num_cpu) {
#if defined(LOONGSON3A)
queue[0].sa = sa;
queue[0].sb = sb;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else
queue[0].sa = sa;
queue[0].sb = sb;
#endif
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu,

View File

@ -55,8 +55,8 @@ int CNAME(int mode,
range_M[0] = 0;
i = arg -> m;
} else {
range_M[0] = range_M[0];
i = range_M[1] - range_M[0];
range_M[0] = range_m[0];
i = range_m[1] - range_m[0];
}
num_cpu_m = 0;

View File

@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
/* Initializing routine */
int blas_thread_init(void){
BLASLONG i;
int ret;
#ifdef NEED_STACKATTR
pthread_attr_t attr;
#endif
@ -545,12 +546,16 @@ int blas_thread_init(void){
pthread_cond_init (&thread_status[i].wakeup, NULL);
#ifdef NEED_STACKATTR
pthread_create(&blas_threads[i], &attr,
ret=pthread_create(&blas_threads[i], &attr,
(void *)&blas_thread_server, (void *)i);
#else
pthread_create(&blas_threads[i], NULL,
ret=pthread_create(&blas_threads[i], NULL,
(void *)&blas_thread_server, (void *)i);
#endif
if(ret!=0){
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
exit(1);
}
}
#ifdef MONITOR
@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
}
void openblas_set_num_threads(int num_threads) {

View File

@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number);
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
#endif
}
void openblas_set_num_threads(int num_threads) {

View File

@ -390,11 +390,11 @@ static void *alloc_mmap(void *address){
#ifdef OS_LINUX
#ifdef DEBUG
int ret;
int ret=0;
ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
if(ret==-1){
int errsv=errno;
perror("alloc_mmap:");
perror("OpenBLAS alloc_mmap:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
}
@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
#endif

View File

@ -45,8 +45,22 @@ int get_L2_size(void);
#define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128
/* Fallback values used when no compile-time constant is supplied. */
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
/* Global Parameter */
/*
 * Initial value selection for the run-time offset variables.
 * In a #if expression every identifier that is not a macro evaluates to 0,
 * so the comparison below is true when GEMM_OFFSET_A expands to the plain
 * identifier gemm_offset_a (or is undefined, or is 0); the default is then
 * used. When GEMM_OFFSET_A expands to a non-zero constant, that constant
 * becomes the initial value instead.
 * NOTE(review): presumably GEMM_OFFSET_A is defined elsewhere either as a
 * number or as the variable name gemm_offset_a -- confirm in the headers.
 */
#if GEMM_OFFSET_A == gemm_offset_a
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
#else
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
#endif
/* Same selection for the B-side offset. */
#if GEMM_OFFSET_B == gemm_offset_b
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
#else
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
#endif
#if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P;
#else
@ -666,3 +680,36 @@ void blas_set_parameter(void){
#endif
#endif
#if defined(ARCH_MIPS64)
/*
 * Set dgemm_r according to the CPU model selected at compile time and, in
 * SMP builds, according to blas_num_threads.
 *
 *   LOONGSON3A: 1024 when blas_num_threads is 1 (or in non-SMP builds),
 *               200 otherwise.
 *   LOONGSON3B: 640 when blas_num_threads is 1 or 2 (or in non-SMP builds),
 *               160 otherwise.
 *
 * When neither macro is defined the function does nothing.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#ifdef SMP
  /* Large value for a single thread, small value for any other count. */
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  dgemm_r = 1024;
#endif
#endif

#if defined(LOONGSON3B)
#ifdef SMP
  /* One or two threads share the large value. */
  dgemm_r = (blas_num_threads == 1 || blas_num_threads == 2) ? 640 : 160;
#else
  dgemm_r = 640;
#endif
#endif
}
#endif

View File

@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */
@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/*
 * Forced configuration for the Loongson 3B core: selected by defining
 * FORCE_LOONGSON3B, it supplies the architecture names and the cache/TLB
 * description passed on as -D options in ARCHCONFIG.
 * L2_SIZE is 512 KiB = 524288 bytes, a whole multiple of the 32-byte
 * L2_LINESIZE declared next to it.
 */
#ifdef FORCE_LOONGSON3B
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DLOONGSON3B " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3b"
#define CORENAME "LOONGSON3B"
#else
#endif
#ifdef FORCE_ITANIUM2
#define FORCE
#define ARCHITECTURE "IA64"

View File

@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
FLOAT *sa, *sb;
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY)
@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FLOAT *sa, *sb;
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY)

View File

@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
ifeq ($(TARGET), LOONGSON3B)
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@

View File

@ -0,0 +1,157 @@
#include "common.h"
/*
 * Generic C micro-kernel that works on 2x2 tiles of C.
 *
 * For every tile it accumulates, over bk steps, the products of values read
 * from the packed panels ba and bb, multiplies each sum by alpha and adds
 * it to the corresponding entry of C.
 *
 *   bm     number of C entries handled per column (pairs first, then an
 *          odd trailing one)
 *   bn     number of C columns handled (pairs first, then an odd trailing
 *          one)
 *   bk     number of accumulation steps per entry
 *   alpha  factor applied to every accumulated sum
 *   ba     panel read 2 values per step for a row pair, 1 for the odd row
 *   bb     panel read 2 values per step for a column pair, 1 for the odd
 *          column
 *   C      output array, updated in place
 *   ldc    element distance between consecutive columns of C
 *   offset present only when TRMMKERNEL is defined; not used in this body
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
    BLASLONG row, col, step;
    FLOAT *c_lo, *c_hi;   /* write positions in the first / second column */
    FLOAT *pa, *pb;       /* read cursors into ba and bb */
    FLOAT acc00, acc10, acc01, acc11;
    FLOAT scaled;

    /* Pairs of columns. */
    for (col = 0; col < bn/2; col++) {
        c_lo = C;
        c_hi = C + ldc;
        pa = ba;

        /* Pairs of rows: a full 2x2 tile. */
        for (row = 0; row < bm/2; row++) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;
            acc01 = 0;
            acc11 = 0;

            /* Main loop, four steps per iteration. */
            for (step = 0; step < bk/4; step++) {
                acc00 += pa[0]*pb[0];
                acc10 += pa[1]*pb[0];
                acc01 += pa[0]*pb[1];
                acc11 += pa[1]*pb[1];

                acc00 += pa[2]*pb[2];
                acc10 += pa[3]*pb[2];
                acc01 += pa[2]*pb[3];
                acc11 += pa[3]*pb[3];

                acc00 += pa[4]*pb[4];
                acc10 += pa[5]*pb[4];
                acc01 += pa[4]*pb[5];
                acc11 += pa[5]*pb[5];

                acc00 += pa[6]*pb[6];
                acc10 += pa[7]*pb[6];
                acc01 += pa[6]*pb[7];
                acc11 += pa[7]*pb[7];

                pa += 8;
                pb += 8;
            }

            /* Remaining bk&3 steps, one at a time. */
            for (step = 0; step < (bk&3); step++) {
                acc00 += pa[0]*pb[0];
                acc10 += pa[1]*pb[0];
                acc01 += pa[0]*pb[1];
                acc11 += pa[1]*pb[1];
                pa += 2;
                pb += 2;
            }

            /* Scale and add into C, in the same entry order as before. */
            scaled = acc00*alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc10*alpha;
            c_lo[1] = c_lo[1] + scaled;
            scaled = acc01*alpha;
            c_hi[0] = c_hi[0] + scaled;
            scaled = acc11*alpha;
            c_hi[1] = c_hi[1] + scaled;

            c_lo += 2;
            c_hi += 2;
        }

        /* Odd trailing row: one entry in each of the two columns. */
        for (row = 0; row < (bm&1); row++) {
            pb = bb;
            acc00 = 0;
            acc01 = 0;

            for (step = 0; step < bk; step++) {
                acc00 += pa[0]*pb[0];
                acc01 += pa[0]*pb[1];
                pa += 1;
                pb += 2;
            }

            scaled = acc00*alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc01*alpha;
            c_hi[0] = c_hi[0] + scaled;

            c_lo += 1;
            c_hi += 1;
        }

        /* Advance past the two bb columns and the two C columns. */
        bb += (bk<<1);
        C += (ldc<<1);
    }

    /* Odd trailing column. */
    for (col = 0; col < (bn&1); col++) {
        c_lo = C;
        pa = ba;

        /* Pairs of rows against the single column. */
        for (row = 0; row < bm/2; row++) {
            pb = bb;
            acc00 = 0;
            acc10 = 0;

            for (step = 0; step < bk; step++) {
                acc00 += pa[0]*pb[0];
                acc10 += pa[1]*pb[0];
                pa += 2;
                pb += 1;
            }

            scaled = acc00*alpha;
            c_lo[0] = c_lo[0] + scaled;
            scaled = acc10*alpha;
            c_lo[1] = c_lo[1] + scaled;

            c_lo += 2;
        }

        /* Single remaining entry. */
        for (row = 0; row < (bm&1); row++) {
            pb = bb;
            acc00 = 0;

            for (step = 0; step < bk; step++) {
                acc00 += pa[0]*pb[0];
                pa += 1;
                pb += 1;
            }

            scaled = acc00*alpha;
            c_lo[0] = c_lo[0] + scaled;

            c_lo += 1;
        }

        bb += (bk<<0);
        C += ldc;
    }

    return 0;
}

View File

@ -0,0 +1,280 @@
#include "common.h"
/*
 * Generic C micro-kernel on 2x2 tiles that uses the TRMMKERNEL offset.
 *
 * For each tile of C it accumulates products of values read from the packed
 * panels ba and bb, multiplies the sums by alpha and stores them into C.
 * The stores overwrite C (C0[0] = res0); nothing is added to the old value.
 *
 * The number of accumulation steps per tile is `temp`, computed from bk and
 * the running offset `off`. Which part of the bk range is used is chosen at
 * compile time by LEFT and TRANSA:
 *   - (LEFT && TRANSA) or (!LEFT && !TRANSA): reading starts at the
 *     beginning of the panels and `off` plus the tile extent steps are used;
 *     afterwards the cursors are moved past the unused remainder.
 *   - otherwise: the first `off` steps are skipped and the remaining
 *     bk-off steps are used.
 * In both cases ptrba has advanced by bk steps in total after a tile.
 *
 * `off` is updated per row block when LEFT is defined and per column block
 * otherwise.
 *
 * offset is a parameter only when TRMMKERNEL is defined.
 * NOTE(review): `off` is initialised only under TRMMKERNEL but is read
 * without that guard, so this file presumably is always built with
 * -DTRMMKERNEL -- confirm against the build rules.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7;
BLASLONG off, temp;
/* Without LEFT the offset is negated once and then grows per column block. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
/* Column pairs. */
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
/* With LEFT the offset restarts for every column block. */
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* Row pairs: full 2x2 tile. */
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
/* Skip the first `off` steps of both panels (2 values per step each). */
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
/* Number of accumulation steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+2;
#endif
/* Main loop, four steps per iteration. */
for (k=0; k<temp/4; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*1+0];
load5 = ptrbb[2*1+0];
res0 = res0+load4*load5;
load6 = ptrba[2*1+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*1+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
load0 = ptrba[2*2+0];
load1 = ptrbb[2*2+0];
res0 = res0+load0*load1;
load2 = ptrba[2*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*2+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
load4 = ptrba[2*3+0];
load5 = ptrbb[2*3+0];
res0 = res0+load4*load5;
load6 = ptrba[2*3+1];
res1 = res1+load6*load5;
load7 = ptrbb[2*3+1];
res2 = res2+load4*load7;
res3 = res3+load6*load7;
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
/* Remaining temp&3 steps. */
for (k=0; k<(temp&3); k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res2 = res2+load0*load3;
res3 = res3+load2*load3;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
/* Scale and store (overwrite) the four tile entries. */
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
res2 = res2*alpha;
C1[0] = res2;
res3 = res3*alpha;
C1[1] = res3;
/* Move the cursors past the steps that were not used (bk-off-2 of them). */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
/* With LEFT the offset grows by the two rows just handled. */
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
C1 = C1+2;
}
/* Odd trailing row: one entry in each of the two columns. */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
/* ba holds 1 value per step here, bb still 2. */
ptrba += off;
ptrbb = bb+off*2;
#endif
res0 = 0;
res1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrbb[2*0+1];
res1 = res1+load0*load2;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C1[0] = res1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
C1 = C1+1;
}
/* Without LEFT the offset grows by the two columns just handled. */
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
/* Advance bb by two columns of bk values and C by two columns. */
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
/* Odd trailing column. */
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
/* Row pairs against the single column. */
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
/* ba holds 2 values per step here, bb only 1. */
ptrba += off*2;
ptrbb = bb + off;
#endif
res0 = 0;
res1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || \
(!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2;
#else
temp = off+1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[2*0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
res1 = res1*alpha;
C0[1] = res1;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+2;
}
/* Single remaining entry. */
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off;
ptrbb = bb+off;
#endif
res0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
load0 = ptrba[0+0];
load1 = ptrbb[0+0];
res0 = res0+load0*load1;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = res0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp;
ptrbb += temp;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
/* Advance bb by one column of bk values and C by one column. */
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@ -0,0 +1,838 @@
#include "common.h"
/********************************
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
*********************************/
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
, BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
for (k=0; k<bk/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
#endif
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
for (k=0; k<(bk&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
load4 = res4*alphar;
C1[0] = C1[0]+load4;
load5 = res5*alphar;
C1[1] = C1[1]+load5;
load4 = res5*alphai;
C1[0] = C1[0]-load4;
load5 = res4*alphai;
C1[1] = C1[1]+load5;
load6 = res6*alphar;
C1[2] = C1[2]+load6;
load7 = res7*alphar;
C1[3] = C1[3]+load7;
load6 = res7*alphai;
C1[2] = C1[2]-load6;
load7 = res6*alphai;
C1[3] = C1[3]+load7;
C0 = C0+4;
C1 = C1+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3-load0*load5;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3-load0*load5;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C1[0] = C1[0]+load2;
load3 = res3*alphar;
C1[1] = C1[1]+load3;
load2 = res3*alphai;
C1[0] = C1[0]-load2;
load3 = res2*alphai;
C1[1] = C1[1]+load3;
C0 = C0+2;
C1 = C1+2;
}
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
load2 = res2*alphar;
C0[2] = C0[2]+load2;
load3 = res3*alphar;
C0[3] = C0[3]+load3;
load2 = res3*alphai;
C0[2] = C0[2]-load2;
load3 = res2*alphai;
C0[3] = C0[3]+load3;
C0 = C0+4;
}
for (i=0; i<(bm&1); i+=1)
{
ptrbb = bb;
res0 = 0;
res1 = 0;
for (k=0; k<bk; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
load0 = res0*alphar;
C0[0] = C0[0]+load0;
load1 = res1*alphar;
C0[1] = C0[1]+load1;
load0 = res1*alphai;
C0[0] = C0[0]-load0;
load1 = res0*alphai;
C0[1] = C0[1]+load1;
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

View File

@ -0,0 +1,923 @@
#include "common.h"
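/********************************
Partial products of one complex multiply (a + b*i) * (c + d*i).
As read from the kernel body below (confirm against the packing
routines): a, b are consecutive values taken from the packed A
panel (ba) and c, d are consecutive values taken from the packed
B panel (bb), presumably the real and imaginary parts.
ADD1 a*c
ADD2 b*c
ADD3 a*d
ADD4 b*d
The sign with which each product is accumulated differs between
the NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT and RR/RC/CR/CC
preprocessor variants.
*********************************/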
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
FLOAT* C,BLASLONG ldc, BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15;
BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
for (j=0; j<bn/2; j+=1)
{
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
C0 = C;
C1 = C0+2*ldc;
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 2;
#endif
for (k=0; k<temp/4; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2-load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6-load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1+load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3+load13*load9;
res2 = res2+load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5+load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7+load13*load14;
res6 = res6+load13*load15;
res7 = res7-load12*load15;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0+load10*load11;
res1 = res1+load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2+load13*load11;
res3 = res3+load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4+load10*load15;
res5 = res5+load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6+load13*load15;
res7 = res7+load12*load15;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*1+0];
load9 = ptrbb[4*1+0];
res0 = res0+load8*load9;
load10 = ptrba[4*1+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*1+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*1+2];
res2 = res2+load12*load9;
load13 = ptrba[4*1+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*1+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*1+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
load0 = ptrba[4*2+0];
load1 = ptrbb[4*2+0];
res0 = res0+load0*load1;
load2 = ptrba[4*2+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*2+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*2+2];
res2 = res2+load4*load1;
load5 = ptrba[4*2+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*2+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*2+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
load8 = ptrba[4*3+0];
load9 = ptrbb[4*3+0];
res0 = res0+load8*load9;
load10 = ptrba[4*3+1];
res1 = res1-load10*load9;
load11 = ptrbb[4*3+1];
res0 = res0-load10*load11;
res1 = res1-load8*load11;
load12 = ptrba[4*3+2];
res2 = res2+load12*load9;
load13 = ptrba[4*3+3];
res3 = res3-load13*load9;
res2 = res2-load13*load11;
res3 = res3-load12*load11;
load14 = ptrbb[4*3+2];
res4 = res4+load8*load14;
res5 = res5-load10*load14;
load15 = ptrbb[4*3+3];
res4 = res4-load10*load15;
res5 = res5-load8*load15;
res6 = res6+load12*load14;
res7 = res7-load13*load14;
res6 = res6-load13*load15;
res7 = res7-load12*load15;
#endif
ptrba = ptrba+16;
ptrbb = ptrbb+16;
}
for (k=0; k<(temp&3); k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6-load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5+load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7+load5*load6;
res6 = res6+load5*load7;
res7 = res7-load4*load7;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4+load2*load7;
res5 = res5+load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6+load5*load7;
res7 = res7+load4*load7;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
load6 = ptrbb[4*0+2];
res4 = res4+load0*load6;
res5 = res5-load2*load6;
load7 = ptrbb[4*0+3];
res4 = res4-load2*load7;
res5 = res5-load0*load7;
res6 = res6+load4*load6;
res7 = res7-load5*load6;
res6 = res6-load5*load7;
res7 = res7-load4*load7;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C0[2] = load2;
C0[3] = load3;
load4 = res4*alphar-res5*alphai;
load5 = res5*alphar+res4*alphai;
C1[0] = load4;
C1[1] = load5;
load6 = res6*alphar-res7*alphai;
load7 = res7*alphar+res6*alphai;
C1[2] = load6;
C1[3] = load7;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2;
#else
temp -= 2;
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
C1 = C1+4;
}
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off+1;
#else
temp = off+2;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3+load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3-load0*load5;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2+load2*load5;
res3 = res3+load0*load5;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[4*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[4*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrbb[4*0+2];
res2 = res2+load0*load4;
res3 = res3-load2*load4;
load5 = ptrbb[4*0+3];
res2 = res2-load2*load5;
res3 = res3-load0*load5;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C1[0] = load2;
C1[1] = load3;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 2;
#endif
ptrba += temp*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
C1 = C1+2;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/2; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb+off*2;
#endif
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 2;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2-load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3+load5*load1;
res2 = res2+load5*load3;
res3 = res3-load4*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2+load5*load3;
res3 = res3+load4*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[4*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[4*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
load4 = ptrba[4*0+2];
res2 = res2+load4*load1;
load5 = ptrba[4*0+3];
res3 = res3-load5*load1;
res2 = res2-load5*load3;
res3 = res3-load4*load3;
#endif
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
load2 = res2*alphar-res3*alphai;
load3 = res3*alphar+res2*alphai;
C0[2] = load2;
C0[3] = load3;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#ifdef LEFT
temp -= 2;
#else
temp -= 1;
#endif
ptrba += temp*2*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2;
#endif
C0 = C0+4;
}
for (i=0; i<(bm&1); i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0 = 0;
res1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off + 1;
#else
temp = off + 1;
#endif
for (k=0; k<temp; k+=1)
{
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1+load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1-load0*load3;
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0+load2*load3;
res1 = res1+load0*load3;
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
load0 = ptrba[2*0+0];
load1 = ptrbb[2*0+0];
res0 = res0+load0*load1;
load2 = ptrba[2*0+1];
res1 = res1-load2*load1;
load3 = ptrbb[2*0+1];
res0 = res0-load2*load3;
res1 = res1-load0*load3;
#endif
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
load0 = res0*alphar-res1*alphai;
load1 = res1*alphar+res0*alphai;
C0[0] = load0;
C0[1] = load1;
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1;
#else
temp -= 1;
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1;
#endif
C0 = C0+2;
}
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

View File

@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
CGEMM3MKERNEL = zgemm3m_kernel.S
ZGEMM3MKERNEL = zgemm3m_kernel.S

View File

@ -1,18 +1,48 @@
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
SGEMMKERNEL = sgemm_kernel_loongson3a.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = gemm_kernel_loongson3a.S
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -0,0 +1,64 @@
# Kernel source selection for one MIPS64 core.
# NOTE(review): the 2x2 generic kernels below suggest this is the Loongson-3B kernel list - confirm.

# AXPY kernels: Loongson-3A assembly sources.
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
# GEMV kernels: Loongson-3A C sources. S and D share one pair of files, C and Z share another.
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
# TRMM kernels: generic 2x2 C sources.
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM kernels: generic 2x2 C sources, each with 2-wide ncopy/tcopy routines and the
# object names those copy routines are built into.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels: the same four generic C sources (LN/LT/RN/RT) are used for S, D, C and Z.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,101 @@
#include "common.h"
// Auto-tuned helper macros for the real GEMV kernel (non-transposed form) on the Loongson-3A platform.
// Alternative prefetch implementations, kept for reference:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Prefetch hint: emits a MIPS "ld" of the memory operand x into register $0.
// NOTE(review): presumably the target treats a load to $0 as a non-faulting prefetch - confirm,
// because callers pass addresses a fixed distance ahead of the element being processed.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints (GCC builtin).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Single-element loop bodies. They are NOT self-contained: each expands in the scope of
// CNAME and reads/updates its locals (i, j, k, h) and parameters (A, X, Y, LDA, ALPHA, INCY).
//   spec_*   : Y is indexed directly by i (used when INCY == 1).
//   norm_*   : Y is indexed by h, which advances by INCY per element.
//   *_alpha1 : the multiplication by ALPHA is omitted (used when ALPHA == 1).
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
// Real GEMV kernel, non-transposed form: Y += ALPHA * A * X.
//
// Element (i, j) of A is read as A[LDA * j + i] for 0 <= i < M and 0 <= j < N.
// X is read with stride INCX (one element per column j); Y is updated with
// stride INCY (one element per row i).  UNUSED and BUFFER are not referenced.
// Always returns 0.
//
// The spec_loop* / norm_loop* macros expand in this scope and depend on the
// local names i, j, k and h, so those names must be kept as they are.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
    const BLASLONG lookahead = 30;             // prefetch distance, in elements
    const BLASLONG unrolled_end = M - M % 4;   // rows covered by the 4-way unrolled loop
    BLASLONG i, j, k, h;

    // A zero scale factor contributes nothing to Y.
    if (!ALPHA)
        return 0;

    if (INCY == 1) {
        // Contiguous Y: index it directly with the row counter.
        if (ALPHA == 1) {
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                i = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[i + lookahead]);
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                }
                // Remaining (M % 4) rows.
                while (likely(i < M)) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                i = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[i + lookahead]);
                    spec_loop;
                    spec_loop;
                    spec_loop;
                    spec_loop;
                }
                while (likely(i < M)) {
                    spec_loop;
                }
            }
        }
    } else {
        // Strided Y: h tracks the Y offset and restarts at 0 for every column.
        if (ALPHA == 1) {
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                i = 0;
                h = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[h + lookahead]);
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                }
                while (likely(i < M)) {
                    norm_loop_alpha1;
                }
            }
        } else {
            for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
                i = 0;
                h = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(Y[h + lookahead]);
                    norm_loop;
                    norm_loop;
                    norm_loop;
                    norm_loop;
                }
                while (likely(i < M)) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}

View File

@ -0,0 +1,93 @@
#include "common.h"
// Auto-tuned helper macros for the real GEMV kernel (transposed form) on the Loongson-3A platform.
// Alternative prefetch implementations, kept for reference:
//#define prefetch(x) __builtin_prefetch(x)
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
// Prefetch hint: emits a MIPS "ld" of the memory operand x into register $0.
// NOTE(review): presumably the target treats a load to $0 as a non-faulting prefetch - confirm,
// because callers pass addresses a fixed distance ahead of the element being processed.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints (GCC builtin).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Single-element loop bodies. They are NOT self-contained: each expands in the scope of
// CNAME and reads/updates its locals (i, j, k, h) and parameters (A, X, Y, LDA, ALPHA, INCX).
// Every variant accumulates into the single output element Y[k].
//   spec_*   : X is indexed directly by i (used when INCX == 1).
//   norm_*   : X is indexed by h, which advances by INCX per element.
//   *_alpha1 : the multiplication by ALPHA is omitted (used when ALPHA == 1).
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
// Real GEMV kernel, transposed form: for every j in [0, N) the products
// ALPHA * A[LDA * j + i] * X[i * INCX], 0 <= i < M, are accumulated one by one
// into Y[j * INCY].  UNUSED and BUFFER are not referenced.  Always returns 0.
//
// The spec_loop* / norm_loop* macros expand in this scope and depend on the
// local names i, j, k and h, so those names must be kept as they are.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    const BLASLONG lookahead = 30;             // prefetch distance, in elements
    const BLASLONG unrolled_end = M - M % 3;   // elements covered by the 3-way unrolled loop
    BLASLONG i, j, k, h;

    // A zero scale factor contributes nothing to Y.
    if (!ALPHA)
        return 0;

    if (INCX == 1) {
        // Contiguous X: index it directly with the element counter.
        if (ALPHA == 1) {
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                i = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[i + lookahead]);
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                }
                // Remaining (M % 3) elements.
                while (likely(i < M)) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                i = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[i + lookahead]);
                    spec_loop;
                    spec_loop;
                    spec_loop;
                }
                while (likely(i < M)) {
                    spec_loop;
                }
            }
        }
    } else {
        // Strided X: h tracks the X offset and restarts at 0 for every column.
        if (ALPHA == 1) {
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                i = 0;
                h = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[h + lookahead]);
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                }
                while (likely(i < M)) {
                    norm_loop_alpha1;
                }
            }
        } else {
            for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
                i = 0;
                h = 0;
                while (likely(i < unrolled_end)) {
                    prefetch(A[LDA * j + i + lookahead]);
                    prefetch(X[h + lookahead]);
                    norm_loop;
                    norm_loop;
                    norm_loop;
                }
                while (likely(i < M)) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,139 @@
#include "common.h"
//typedef int BLASLONG;
//typedef double FLOAT;
// Prefetch hint: emits a MIPS "ld" of the memory operand x into register $0.
// NOTE(review): presumably the target treats a load to $0 as a non-faulting prefetch - confirm.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints (GCC builtin).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Select the loop-body family from the compile-time CONJ / XCONJ flags:
//   _0: neither defined      -> a * x
//   _1: CONJ only            -> conj(a) * x
//   _2: XCONJ only           -> a * conj(x)
//   _3: both defined         -> conj(a) * conj(x)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
// Loop bodies. Each handles one complex element stored as an interleaved (real, imaginary)
// pair and is NOT self-contained: it expands in the scope of CNAME and uses its locals
// (ii, iii, jj, k, rTmp, iTmp) and parameters (A, X, Y, INCY, rALPHA, iALPHA).
//   a = (A[jj + ii], A[jj + ii + 1]),  x = (X[k], X[k + 1]).
// spec_loop_alpha1_*: Y indexed by ii (INCY == 1), product added to Y without scaling by alpha.
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
// spec_loop_*: Y indexed by ii (INCY == 1); the product (rTmp, iTmp) is scaled by (rALPHA, iALPHA).
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
// norm_loop_alpha1_*: Y indexed by iii, which advances by INCY * 2; no scaling by alpha.
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
// norm_loop_*: Y indexed by iii (stride INCY * 2); the product is scaled by (rALPHA, iALPHA).
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
// Complex GEMV kernel, non-transposed form: Y += alpha * A * X, where
// alpha = (rALPHA, iALPHA) and the elements of A and/or X are conjugated
// according to the compile-time CONJ / XCONJ selection made by the loop macros.
//
// Complex values are stored as interleaved (real, imaginary) FLOAT pairs.
// Column j of A starts at A[LDA * 2 * j]; X is read with a stride of INCX
// complex elements (one per column) and Y is updated with a stride of INCY
// complex elements (one per row).  M is the number of rows, N the number of
// columns.  UNUSED and BUFFER are not referenced.  Always returns 0.
//
// The spec_loop* / norm_loop* macros expand in this scope and depend on the
// local names ii, iii, jj, k, rTmp and iTmp, so those names must be kept.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    // Only a completely zero alpha makes the update a no-op.  The previous test
    // (!rALPHA && iALPHA) instead bailed out for a purely imaginary alpha and
    // left Y without its update.
    if (rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 60;                  // prefetch distance, in FLOAT elements
    BLASLONG spec_unroll = 2;              // complex elements per unrolled iteration
    BLASLONG tMQ = M - M % spec_unroll;    // rows covered by the unrolled loop
    BLASLONG j = 0, k = 0, jj = 0;

    if (rALPHA == 1 && iALPHA == 0) {
        // alpha == 1: add the products directly, no scaling.
        if (INCY == 1) {
            for (; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                // Remaining (M % spec_unroll) rows.
                for (; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for (; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for (; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        // General alpha: the macros build each product in (rTmp, iTmp) and scale it.
        FLOAT rTmp, iTmp;
        if (INCY == 1) {
            for (; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for (; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for (; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(Y[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for (; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}

View File

@ -0,0 +1,125 @@
#include "common.h"
// Prefetch hint: emits a MIPS "ld" of the memory operand x into register $0.
// NOTE(review): presumably the target treats a load to $0 as a non-faulting prefetch - confirm.
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
// Branch-prediction hints (GCC builtin).
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
// Select the loop-body family from the compile-time CONJ / XCONJ flags:
//   _0: neither defined      -> a * x
//   _1: CONJ only            -> conj(a) * x
//   _2: XCONJ only           -> a * conj(x)
//   _3: both defined         -> conj(a) * conj(x)
#if !defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_0
#define spec_loop spec_loop_0
#define norm_loop_alpha1 norm_loop_alpha1_0
#define norm_loop norm_loop_0
#endif
#if defined(CONJ) && !defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_1
#define spec_loop spec_loop_1
#define norm_loop_alpha1 norm_loop_alpha1_1
#define norm_loop norm_loop_1
#endif
#if !defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_2
#define spec_loop spec_loop_2
#define norm_loop_alpha1 norm_loop_alpha1_2
#define norm_loop norm_loop_2
#endif
#if defined(CONJ) && defined(XCONJ)
#define spec_loop_alpha1 spec_loop_alpha1_3
#define spec_loop spec_loop_3
#define norm_loop_alpha1 norm_loop_alpha1_3
#define norm_loop norm_loop_3
#endif
// Loop bodies. Each handles one complex element stored as an interleaved (real, imaginary)
// pair and is NOT self-contained: it expands in the scope of CNAME and uses its locals
// (ii, iii, jj, k, rTmp, iTmp) and parameters (A, X, Y, INCX, rALPHA, iALPHA).
// Every variant accumulates into the single complex output (Y[k], Y[k + 1]).
// spec_loop_alpha1_*: X indexed by ii (INCX == 1), product added to Y without scaling by alpha.
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
// spec_loop_*: X indexed by ii (INCX == 1); the product (rTmp, iTmp) is scaled by (rALPHA, iALPHA).
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
// norm_loop_alpha1_*: X indexed by iii, which advances by INCX * 2; no scaling by alpha.
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
// norm_loop_*: X indexed by iii (stride INCX * 2); the product is scaled by (rALPHA, iALPHA).
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
// Complex GEMV kernel, transposed form: for every j in [0, N) the products
// alpha * a(i, j) * x(i), 0 <= i < M, are accumulated into the complex output
// element at Y[j * INCY * 2].  alpha = (rALPHA, iALPHA); the elements of A
// and/or X are conjugated according to the compile-time CONJ / XCONJ selection
// made by the loop macros.
//
// Complex values are stored as interleaved (real, imaginary) FLOAT pairs.
// Column j of A starts at A[LDA * 2 * j]; X is read with a stride of INCX
// complex elements.  UNUSED and BUFFER are not referenced.  Always returns 0.
//
// The spec_loop* / norm_loop* macros expand in this scope and depend on the
// local names ii, iii, jj, k, rTmp and iTmp, so those names must be kept.
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
    // Only a completely zero alpha makes the update a no-op.  The previous test
    // (!rALPHA && iALPHA) instead bailed out for a purely imaginary alpha and
    // left Y without its update.
    if (rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 30;                  // prefetch distance, in FLOAT elements
    BLASLONG spec_unroll = 2;              // complex elements per unrolled iteration
    BLASLONG tMQ = M - M % spec_unroll;    // elements covered by the unrolled loop
    BLASLONG j = 0, k = 0, jj = 0;

    if (rALPHA == 1 && iALPHA == 0) {
        // alpha == 1: add the products directly, no scaling.
        if (INCX == 1) {
            for (; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop_alpha1;
                    /*loop_mark*/ spec_loop_alpha1;
                }
                // Remaining (M % spec_unroll) elements.
                for (; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for (; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop_alpha1;
                    /*loop_mark*/ norm_loop_alpha1;
                }
                for (; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        // General alpha: the macros build each product in (rTmp, iTmp) and scale it.
        FLOAT rTmp, iTmp;
        if (INCX == 1) {
            for (; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    /*loop_mark*/ spec_loop;
                    /*loop_mark*/ spec_loop;
                }
                for (; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for (; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;
                for (; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    /*loop_mark*/ norm_loop;
                    /*loop_mark*/ norm_loop;
                }
                for (; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }
    return 0;
}

79
param.h
View File

@ -1480,31 +1480,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 1
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 32
#define DGEMM_DEFAULT_P 32
#define CGEMM_DEFAULT_P 108
#define ZGEMM_DEFAULT_P 112
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 72
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 44
#define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 32
#define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000
#define CGEMM_DEFAULT_R 2000
#define ZGEMM_DEFAULT_R 2000
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 92
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 640
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 640
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
/* Tuning parameters used when LOONGSON3B is defined. */
#ifdef LOONGSON3B
#define SNUMOPT 2
#define DNUMOPT 2
/* Buffer offset/alignment settings for GEMM. */
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Unroll factors: 2 x 2 for all four precisions. */
/* NOTE(review): presumably these must match the kernel and copy routines selected for this core - confirm. */
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
/* P / Q / R sizes per precision. */
/* NOTE(review): presumably the cache-blocking dimensions used by the level-3 driver - confirm. */
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 24
#define CGEMM_DEFAULT_P 24
#define ZGEMM_DEFAULT_P 20
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif

View File

@ -1301,6 +1301,8 @@
NC = 0
RESET = .TRUE.
ERRMAX = RZERO
RALS = RONE
RBETS = RONE
*
DO 100 IN = 1, NIDIM
N = IDIM( IN )

View File

@ -1303,6 +1303,8 @@
NC = 0
RESET = .TRUE.
ERRMAX = RZERO
RALS = RONE
RBETS = RONE
*
DO 100 IN = 1, NIDIM
N = IDIM( IN )