From b95ad4cfafdadb2cf7d7baae70d42a83d246f50a Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 9 Nov 2011 19:28:22 +0000 Subject: [PATCH 1/9] Support detecting ICT Loongson-3B CPU. --- common_mips64.h | 12 +++++- cpuid_mips.c | 19 ++++++++- driver/others/parameter.c | 2 +- getarch.c | 15 ++++++++ kernel/mips64/KERNEL.LOONGSON3B | 68 +++++++++++++++++++++++++++++++++ param.h | 41 ++++++++++++++++++++ 6 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 kernel/mips64/KERNEL.LOONGSON3B diff --git a/common_mips64.h b/common_mips64.h index 35d8265bc..15f947eb8 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,10 +101,13 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) unsigned long long tmp; __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); ret=tmp; +#elif defined(LOONGSON3B) + //Temp Implementation. + return 1; #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -234,6 +237,11 @@ REALNAME: ;\ #define FIXED_PAGESIZE (16UL << 10) #endif +#if defined(LOONGSON3B) +#define PAGESIZE (16UL << 10) +#define FIXED_PAGESIZE (16UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif @@ -245,7 +253,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips.c b/cpuid_mips.c index f50a4ec3e..217492dd7 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 static char *cpuname[] = { "UNKOWN", "SICORTEX", - "LOONGSON3A" + "LOONGSON3A", + "LOONGSON3B" }; int detect(void){ @@ -101,6 +103,8 @@ int detect(void){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ @@ -130,6 +134,8 @@ void get_architecture(void){ void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); }else{ printf("SICORTEX"); } @@ -149,6 +155,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 524288\n"); /* 2^19 bytes: a multiple of L2_LINESIZE * L2_ASSOCIATIVE; NOTE(review): confirm against the Loongson-3B manual */ + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -164,6 +179,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); }else{ #ifdef __mips64 printf("mips64\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index fc7f0447e..3e660220e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -683,7 +683,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #ifdef SMP if(blas_num_threads == 1){ #endif diff --git a/getarch.c b/getarch.c index df052df8a..5b614472a 
100644 --- a/getarch.c +++ b/getarch.c @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3B +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "LOONGSON3B" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DLOONGSON3B " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "loongson3b" +#define CORENAME "LOONGSON3B" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B new file mode 100644 index 000000000..fc247e473 --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3B @@ -0,0 +1,68 @@ +SAXPYKERNEL=axpy_loongson3a.S +DAXPYKERNEL=daxpy_loongson3a_simd.S + +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c + + +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = 
dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/param.h b/param.h index 4ffe05cf8..39f0d996c 100644 --- a/param.h +++ b/param.h @@ -1513,6 +1513,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#ifdef LOONGSON3B +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 80 + +#define SGEMM_DEFAULT_R 1024 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_R 1024 + +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 + +#define SYMV_P 16 +#endif + #ifdef GENERIC #define SNUMOPT 2 From d1baf14a64d93062763f9899fa9c2d7e4bad62a3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 11 Nov 2011 17:49:41 +0000 Subject: [PATCH 2/9] Enable thread affinity on Loongson 3B. Fixed a bug in reading the cycle counter. On Loongson 3A and 3B, the CPU core increments the counter once every 2 cycles by default. 
--- Makefile.system | 2 ++ common_mips64.h | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Makefile.system b/Makefile.system index 84f41a78f..985f95084 100644 --- a/Makefile.system +++ b/Makefile.system @@ -591,9 +591,11 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) +ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif +endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY diff --git a/common_mips64.h b/common_mips64.h index 15f947eb8..5db96c4aa 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,13 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) - unsigned long long tmp; - __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); - ret=tmp; -#elif defined(LOONGSON3B) - //Temp Implementation. - return 1; +#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; + //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); + //ret=tmp; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $2\n" + ".set pop": "=r"(ret):: "memory"); + #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -117,6 +119,18 @@ static inline unsigned int rpcc(void){ return ret; } +//#if defined(LOONGSON3A) || defined(LOONGSON3B) +static inline int WhereAmI(void){ + int ret=0; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $0\n" + ".set pop": "=r"(ret):: "memory"); + return ret; + +} +//#endif + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } From 285e69e2d16c6c4d5addcc124801c1aed01b1e2d Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 17 Nov 2011 16:46:26 +0000 Subject: [PATCH 3/9] Disable using simple thread level3 to fix a bug on Loongson 3B. 
--- Makefile.system | 9 ++++++++- common_mips64.h | 9 +++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 985f95084..da3820fec 100644 --- a/Makefile.system +++ b/Makefile.system @@ -275,7 +275,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -march=mips64 +FCOMMON_OPT += -march=mips64 +endif + +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -529,8 +534,10 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) +ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif +endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 diff --git a/common_mips64.h b/common_mips64.h index 5db96c4aa..560f2c372 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -119,7 +119,8 @@ static inline unsigned int rpcc(void){ return ret; } -//#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3A) || defined(LOONGSON3B) +#define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" @@ -129,7 +130,7 @@ static inline int WhereAmI(void){ return ret; } -//#endif +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; @@ -252,8 +253,8 @@ REALNAME: ;\ #endif #if defined(LOONGSON3B) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) +#define PAGESIZE (32UL << 10) +#define FIXED_PAGESIZE (32UL << 10) #endif #ifndef PAGESIZE From ef6f7f32ae1ed01d65acce15d6c209ee5caee4c0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 23 Nov 2011 17:17:41 +0000 Subject: [PATCH 4/9] Fixed mbind bug on Loongson 3B. Check the return value of my_mbind function. 
--- common_linux.h | 4 ++++ driver/others/memory.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common_linux.h b/common_linux.h index 8b3d44bfa..45a688d23 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,9 +68,13 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { +#if defined (LOONGSON3B) + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/driver/others/memory.c b/driver/others/memory.c index ac9c87850..feb45eb58 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){ if (map_address != (void *)-1) { #ifdef OS_LINUX -#ifdef DEBUG - int ret; +#if 1 + //#ifdef DEBUG + int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; - perror("alloc_mmap:"); + perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } From 8163ab7e55969d395a6ecd8881b2678e38e8b146 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Wed, 23 Nov 2011 18:40:35 +0000 Subject: [PATCH 5/9] Change the block size on Loongson 3B. 
--- driver/level3/gemm_thread_mn.c | 4 ++-- driver/level3/gemm_thread_variable.c | 4 ++-- driver/others/parameter.c | 17 ++++++++++++++++- param.h | 6 +++--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 321e88f0c..b81c6fa40 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9d83e950a..9ffe17040 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -55,8 +55,8 @@ int CNAME(int mode, range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 3e660220e..21f56e889 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -683,7 +683,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3A) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -696,5 +696,20 @@ void blas_set_parameter(void){ } #endif #endif + +#if defined(LOONGSON3B) +#ifdef SMP + if(blas_num_threads == 1 || blas_num_threads == 2){ +#endif + //one or two threads; also the unconditional value when SMP is not defined + dgemm_r = 640; +#ifdef SMP + }else{ + //any other thread count + dgemm_r = 160; + } +#endif +#endif + } #endif diff --git a/param.h b/param.h index 39f0d996c..610eb5fab 100644 --- a/param.h +++ b/param.h @@ -1502,10 +1502,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 1024 +#define SGEMM_DEFAULT_R 640 #define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R 1024 -#define ZGEMM_DEFAULT_R 1024 +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 From 66904fc4e8c43d05231487ab0e063417141be4f6 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Fri, 25 Nov 2011 11:20:25 +0000 Subject: [PATCH 6/9] BLAS3 used standard MIPS instructions without extensions on Loongson 3B. --- kernel/mips64/KERNEL.LOONGSON3B | 16 +- kernel/mips64/cgemm_kernel_loongson3b_2x2.S | 1468 +++++++++++ kernel/mips64/dgemm_kernel_loongson3b_4x4.S | 2579 +++++++++++++++++++ kernel/mips64/sgemm_kernel_loongson3b_4x4.S | 2579 +++++++++++++++++++ kernel/mips64/zgemm_kernel_loongson3b_2x2.S | 1468 +++++++++++ param.h | 4 +- 6 files changed, 8100 insertions(+), 14 deletions(-) create mode 100644 kernel/mips64/cgemm_kernel_loongson3b_2x2.S create mode 100644 kernel/mips64/dgemm_kernel_loongson3b_4x4.S create mode 100644 kernel/mips64/sgemm_kernel_loongson3b_4x4.S create mode 100644 kernel/mips64/zgemm_kernel_loongson3b_2x2.S diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B index fc247e473..b98f263c4 100644 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ b/kernel/mips64/KERNEL.LOONGSON3B @@ -11,33 +11,25 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c -SGEMMKERNEL = sgemm_kernel_8x4_ps.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMKERNEL = sgemm_kernel_loongson3b_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMKERNEL = dgemm_kernel_loongson3b_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = 
../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S -CGEMMINCOPY = ../generic/zgemm_ncopy_4.c -CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMKERNEL = cgemm_kernel_loongson3b_2x2.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMKERNEL = zgemm_kernel_loongson3b_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 
$f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && 
!defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, 
c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + 
LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, 
a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 
* SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * 
SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, 
K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, 
BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + 
+ LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # 
A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, 
TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * 
SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) 
+ LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S new file mode 100644 index 000000000..4a8c9b0e4 --- /dev/null +++ b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S @@ -0,0 +1,2579 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 
$f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/4 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is related to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/4 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) &&
!defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for these two situations KK is the length of the data part +#endif + dsra K,TEMP,2 # K=TEMP/4 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/4 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD
t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD 
t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD 
c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + 
ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD 
t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi 
K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu 
KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/4 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD
a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) 
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + + daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + 
+.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 + LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + 
MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4*SIZE(CO1) # element offset like the GEMM branch (was raw byte offset 4) + FETCH $0,4*SIZE(CO2) # element offset like the GEMM branch (was raw byte offset 4) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) +
dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD 
t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t22,t11 + beqz K,.L65 + nop + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD 
b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + 
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST 
t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 
+ +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S new file mode 100644 index 000000000..4a8c9b0e4 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S @@ -0,0 +1,2579 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 
+#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # 
Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # 
clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD 
t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + 
FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL 
t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + 
daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + 
LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * 
SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD 
b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + 
+ .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD 
t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + + daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + +.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 + LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD 
t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD 
b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV 
t22,t11 + beqz K,.L65 + nop + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + 
dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + 
LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH 
$0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + 
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 
3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L 
$7 + +#define CO1 $14 +#define CO2 $15 /* CO1/CO2: output pointers; .L10 sets CO1 = C and CO2 = C + LDC */ +#define PREA $16 +#define PREB $17 /* PREA/PREB: addresses used as FETCH targets inside .L12 */ + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif /* TRMMKERNEL: OFFSET/KK/TEMP exist only in TRMM builds */ + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 /* a1-a4, b1-b4: operands loaded from AO / BO at offsets 0..3 * SIZE */ + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 /* a5-a8, b5-b8: second operand set, loaded in .L12 from offsets 4..7 * SIZE. NOTE: b7/b8 occupy $f15/$f16, the same registers as ALPHA_R/ALPHA_I; the prologue stores alpha to 128($sp)/136($sp) before they are overwritten */ + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 /* c11..c44: 16 accumulators, cleared with MTC $0 / MOV before each K loop. NOTE(review): c43/c44 use $f30/$f31, but the visible prologue saves only $f24-$f29 (and $f20-$f23 when __64BIT__ is undefined) - confirm against the ABI callee-saved set */ + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 /* F0..F31: bare register numbers; presumably operands for the gsLQC1/gsSQC1 .word encoders - TODO confirm, no use is visible in this part of the file */ + +#define ALPHA_R $f15 +#define ALPHA_I $f16 /* alpha (real, imaginary) as read on entry; aliases b7/b8, see note there */ + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB /* NN/NT/TN/TT: only the MADD4 (d*b) term uses NMSUB */ +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD /* NR/NC/TR/TC: only the MADD3 (a*d) term uses NMSUB */ +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD /* RN/RT/CN/CT: only the MADD2 (b*c) term uses NMSUB */ +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB /* RR/RC/CR/CC: MADD2, MADD3 and MADD4 use NMSUB; only MADD1 (a*c) stays MADD */ 
+#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, 
KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREB=B+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREB=B+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 *
SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: 
+ daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD 
c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREB=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREB=B+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 *
SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 
2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 
+ + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * 
SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) 
+ MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( 
defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 
c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + 
+#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/param.h b/param.h index 610eb5fab..1cf08a3fa 100644 --- a/param.h +++ b/param.h @@ -1521,13 +1521,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 From c2dad58ad1ccbd1476827a1ccb615bd002248980 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Thu, 1 Dec 2011 16:33:11 +0000 Subject: [PATCH 7/9] Adding n32 multiple threads condition. 
--- common_linux.h | 6 +++++- common_mips64.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common_linux.h b/common_linux.h index 45a688d23..8d9019a0d 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,8 +68,12 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { -#if defined (LOONGSON3B) +#if defined (LOONGSON3B) +#if defined (__64BIT__) return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else + return 0; //NULL Implementation on Loongson 3B 32bit. +#endif #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 unsigned long null_nodemask=0; diff --git a/common_mips64.h b/common_mips64.h index 560f2c372..85348377e 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -120,6 +120,7 @@ static inline unsigned int rpcc(void){ } #if defined(LOONGSON3A) || defined(LOONGSON3B) +#ifndef NO_AFFINITY #define WHEREAMI static inline int WhereAmI(void){ int ret=0; @@ -131,6 +132,7 @@ static inline int WhereAmI(void){ } #endif +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; From 8e53b57bb26e4e9ac32cbb0b362a7946e2028944 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Tue, 10 Jan 2012 17:16:13 +0000 Subject: [PATCH 8/9] Appending gemmkernel and trmmkernel C code in kernel/generic, this code can be used to execute on a new platform which does not have optimized assembly kernel.
--- kernel/Makefile.L3 | 86 +++ kernel/generic/gemmkernel_2x2.c | 157 ++++++ kernel/generic/trmmkernel_2x2.c | 280 ++++++++++ kernel/generic/zgemmkernel_2x2.c | 838 ++++++++++++++++++++++++++++ kernel/generic/ztrmmkernel_2x2.c | 923 +++++++++++++++++++++++++++++++ kernel/mips64/KERNEL.LOONGSON3B | 20 +- 6 files changed, 2296 insertions(+), 8 deletions(-) create mode 100644 kernel/generic/gemmkernel_2x2.c create mode 100644 kernel/generic/trmmkernel_2x2.c create mode 100644 kernel/generic/zgemmkernel_2x2.c create mode 100644 kernel/generic/ztrmmkernel_2x2.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4e331a445..4f419dc80 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifeq ($(TARGET), LOONGSON3B) +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + 
+$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c new file mode 100644 index 000000000..3645ef154 --- /dev/null +++ b/kernel/generic/gemmkernel_2x2.c @@ -0,0 +1,157 @@ +#include "common.h" +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + for (j=0; j Date: Wed, 11 Jan 2012 16:05:39 +0000 Subject: [PATCH 9/9] Modify P Q R size of Loongson3b. --- param.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/param.h b/param.h index 1cf08a3fa..72d721d4e 100644 --- a/param.h +++ b/param.h @@ -1521,11 +1521,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1534,19 +1534,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 44 -#define CGEMM_DEFAULT_P 64 -#define ZGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_P 24 +#define CGEMM_DEFAULT_P 24 +#define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 92 +#define DGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 80 +#define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 1024 -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R 1024 -#define ZGEMM_DEFAULT_R 1024 +#define SGEMM_DEFAULT_R 512 +#define DGEMM_DEFAULT_R 512 +#define CGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 512 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000