diff --git a/Changelog.txt b/Changelog.txt
index fb149ca7a..b54949ec5 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,7 +1,8 @@
 OpenBLAS ChangeLog
 ====================================================================
-Version 0.1 alpha2(in development)
-0;136;0c
+Version 0.1 alpha2
+23-Jun-2011
+
 common:
 	* Fixed blasint undefined bug in file. Other software could include this header successfully(Refs issue #13 on github)
@@ -31,6 +32,8 @@ x86/x86_64:
 
 MIPS64:
 	* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
+	* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
+	* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
 
 ====================================================================
 Version 0.1 alpha1
diff --git a/Makefile b/Makefile
index c480fc47d..798c56192 100644
--- a/Makefile
+++ b/Makefile
@@ -74,7 +74,7 @@ ifeq ($(OSNAME), Darwin)
 endif
 ifeq ($(OSNAME), WINNT)
 	$(MAKE) -C exports dll
-#	-ln -fs $(LIBDLLNAME) libopenblas.dll
+	-ln -fs $(LIBDLLNAME) libopenblas.dll
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 	$(MAKE) -C exports dll
diff --git a/README b/README
index 21e740689..9a7b16326 100644
--- a/README
+++ b/README
@@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
 9.Known Issues
 * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32.
+* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) can compile the code successfully.
 
 10. Specification of Git Branches
 We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
@@ -79,4 +80,4 @@ Now, there are 4 branches in github.com.
 * The master branch. This a main branch to reflect a production-ready state.
 * The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
 * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
-* The gh-pages branch. This is for web pages
\ No newline at end of file
+* The gh-pages branch. This is for web pages
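
Note on the common_mips64.h hunk below: the Loongson3A build hard-codes a
16 KB page size. A minimal sketch to confirm that assumption at runtime on a
POSIX system (the file name and the check are illustrative, not from the patch;
only the 16 KB constant mirrors the hunk):

    /* page_check.c -- compare the kernel's page size with the 16 KB constant */
    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
        long actual = sysconf(_SC_PAGESIZE);   /* page size the OS reports    */
        long wired  = 16L << 10;               /* PAGESIZE value from the hunk */
        printf("OS page size: %ld, header constant: %ld\n", actual, wired);
        return actual == wired ? 0 : 1;        /* nonzero if they disagree     */
    }
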
diff --git a/common_mips64.h b/common_mips64.h
index 7c7a70ba5..acea79011 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -220,6 +220,11 @@ REALNAME: ;\
 
 #define BUFFER_SIZE ( 8 << 20)
 
+#if defined(LOONGSON3A)
+#define PAGESIZE (16UL << 10)
+#define FIXED_PAGESIZE (16UL << 10)
+#endif
+
 #ifndef PAGESIZE
 #define PAGESIZE (64UL << 10)
 #endif
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 17d886e52..4fd4cd440 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -38,7 +38,7 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include <sched.h>
+//#include <sched.h>
 
 #include "common.h"
 
 #ifndef USE_OPENMP
diff --git a/exports/Makefile b/exports/Makefile
index 6e067acbf..f4c9314f9 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME)
 zip : dll
 	zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)
 
-dll : libgoto2.dll
+dll : ../$(LIBDLLNAME)
+#libgoto2.dll
 
 dll2 : libgoto2_shared.dll
 
-libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
+../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
 	$(RANLIB) ../$(LIBNAME)
 ifeq ($(BINARY32), 1)
-	$(DLLWRAP) -o $(@F) --def libgoto2.def \
+	$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
 	--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
 	-lib /machine:i386 /def:libgoto2.def
 else
-	$(DLLWRAP) -o $(@F) --def libgoto2.def \
+	$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
 	--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
 	-lib /machine:X64 /def:libgoto2.def
 endif
diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL
index dd0d2cfea..ebb447b11 100644
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -91,15 +91,37 @@ ifndef ZGEMM_BETA
 ZGEMM_BETA = ../generic/zgemm_beta.c
 endif
 
+ifndef STRSMKERNEL_LN
 STRSMKERNEL_LN = trsm_kernel_LN.S
-STRSMKERNEL_LT = trsm_kernel_LT.S
-STRSMKERNEL_RN = trsm_kernel_LT.S
-STRSMKERNEL_RT = trsm_kernel_RT.S
+endif
+ifndef STRSMKERNEL_LT
+STRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RN
+STRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef STRSMKERNEL_RT
+STRSMKERNEL_RT = trsm_kernel_RT.S
+endif
+
+ifndef DTRSMKERNEL_LN
 DTRSMKERNEL_LN = trsm_kernel_LN.S
+endif
+
+ifndef DTRSMKERNEL_LT
 DTRSMKERNEL_LT = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RN
 DTRSMKERNEL_RN = trsm_kernel_LT.S
+endif
+
+ifndef DTRSMKERNEL_RT
 DTRSMKERNEL_RT = trsm_kernel_RT.S
+endif
 
 CTRSMKERNEL_LN = ztrsm_kernel_LT.S
 CTRSMKERNEL_LT = ztrsm_kernel_LT.S
diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index b295070d9..e72ac142e 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -1,2 +1,24 @@
 SAXPYKERNEL=axpy_loongson3a.S
 DAXPYKERNEL=daxpy_loongson3a_simd.S
+
+SGEMMKERNEL = sgemm_kernel_loongson3a.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = gemm_kernel_loongson3a.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
diff --git 
a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/gemm_kernel_loongson3a.S new file mode 100644 index 000000000..3e95a3ed4 --- /dev/null +++ b/kernel/mips64/gemm_kernel_loongson3a.S @@ -0,0 +1,2390 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if 
defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R8,F3,F2,1) # a2,a3 + + MOV t32,t11 + MOV t42,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + + MOV t13,t11 + MOV t23,t11 + + MOV t33,t11 + MOV t43,t11 + + MOV t14,t11 + MOV t24,t11 + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t21,t11 + MOV t31,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t41,t11 + MOV t12,t11 + gsLQC1(R8,F3,F2,1) # a2,a3 + + MOV t22,t11 + MOV t32,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + + MOV t13,t11 + MOV t23,t11 + + MOV t33,t11 + MOV t43,t11 + + MOV t14,t11 + MOV t24,t11 + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + gsLQC1(R8,F5,F4,2) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + + FETCH $0,(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + FETCH $0,(PREA) + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L12: + gsLQC1(R8,F1,F0,4) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,4) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,5) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + +.L13: + gsLQC1(R8,F5,F4,6) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,6) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,7) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,7) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,16*SIZE # 4mr*4kr + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu B,B,16*SIZE # 4nr*4kr + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + FETCH $0,8*SIZE(PREA) + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L14: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + + gsLQC1(R9,F11,F10,1) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD 
t23,t23,a5,b6 + + FETCH $0,12*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + daddu PREA,PREA,16*SIZE + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + gsLQC1(R8,F5,F4,2) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + gsLQC1(R9,F15,F14,3) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # 4mr*2kr + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu B,B,8*SIZE # 4nr*2kr + + FETCH $0,0(PREA) + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L17: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + gsLQC1(R9,F11,F10,1) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddu PREB,PREB,8*SIZE + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 
# TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t13,t11 + MOV t23,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + MTC $0,t11 + gsLQC1(R8,F1,F0,0) + + MOV t21,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) + + MOV t22,t11 + dsra K,KCO,2 + gsLQC1(R9,F11,F10,1) + + MOV t13,t11 + MOV t23,t11 + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 +#endif + +.L21: # nr=4,mr=2,kr=4 + gsLQC1(R8,F5,F4,1) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + gsLQC1(R8,F3,F2,2) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,4) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R9,F11,F10,5) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b0 + MADD t21,t21,a3,b0 + + gsLQC1(R9,F13,F12,6) + MADD t12,t12,a2,b1 + MADD t22,t22,a3,b1 + + gsLQC1(R9,F15,F14,7) + MADD t13,t13,a2,b2 + MADD t23,t23,a3,b2 + daddu A,A,8*SIZE # 2mr*4kr + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu B,B,16*SIZE # 4nr*4kr + + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a6,b4 + MADD t21,t21,a7,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a6,b5 + MADD t22,t22,a7,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a6,b6 + 
MADD t23,t23,a7,b6 + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + gsLQC1(R8,F5,F4,1) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,2) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + daddu A,A,4*SIZE # 2mr*2kr + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 0 + BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + MTC $0,t11 + MOV t12,t11 + LD a0, 0 * SIZE(A) # a0 + + MOV t13,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t14,t11 # clear result registers + gsLQC1(R9,F11,F10,1) # b2,b3 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz 
K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + MOV t12,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t13,t11 + MOV t14,t11 + gsLQC1(R9,F11,F10,1) # b2,b3 + + beqz K,.L35 + nop +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,2) # b4,b5 + MADD t12,t12,a0,b1 + + gsLQC1(R9,F15,F14,3) # b6,b7 + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + gsLQC1(R9,F9,F8,4) + MADD t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,5) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + daddiu K,K,-1 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + + gsLQC1(R9,F13,F12,6) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + gsLQC1(R9,F15,F14,7) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + daddu B,B,16*SIZE # 4nr*4kr + + LD a0, 0*SIZE(A) # a0 + MADD t11,t11,a3,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a3,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a3,b6 + bnez K,.L31 + MADD t14,t14,a3,b7 + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,2) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + gsLQC1(R9,F15,F14,3) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a1,b5 + + gsLQC1(R9,F11,F10,1) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, 0 + BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t12,t11 + MOV t22,t11 + gsLQC1(R8,F3,F2,1) # a2,a3 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + 
dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # gemm part + gsLQC1(R8,F1,F0,0) # a0,a1 + + MOV t21,t11 + MOV t31,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + + MOV t41,t11 + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R8,F3,F2,1) # a2,a3 + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 +#endif + +.L41: # nr=2,mr=kr=4 + gsLQC1(R8,F5,F4,2) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + gsLQC1(R8,F1,F0,4) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,5) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + gsLQC1(R8,F5,F4,6) + MADD t11,t11,a0,b2 + MADD t21,t21,a1,b2 + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a0,b3 + MADD t22,t22,a1,b3 + + gsLQC1(R8,F7,F6,7) + MADD t31,t31,a2,b2 + MADD t41,t41,a3,b2 + daddu B,B,8*SIZE # 2nr*4kr + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + daddu A,A,16*SIZE # 4mr*4kr + +.L44: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b6 + MADD t21,t21,a5,b6 + daddiu K,K,-1 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b7 + MADD t22,t22,a5,b7 + daddu PREA,PREA,16*SIZE + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b6 + MADD t41,t41,a7,b6 + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + gsLQC1(R8,F5,F4,2) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu 
CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t21,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + dsra K,KCO,2 # K=KCO/2 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R9,F9,F8,0) #b0,b1 + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 +#endif + +.L51: # nr=2 mr=2,kr=4 + gsLQC1(R8,F5,F4,1) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + gsLQC1(R8,F3,F2,2) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F11,F10,2) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + daddiu K,K,-1 + + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b2 + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a2,b3 + MADD t22,t22,a3,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a6,b6 + MADD t21,t21,a7,b6 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a6,b7 + bnez K,.L51 + MADD t22,t22,a7,b7 + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + gsLQC1(R8,F5,F4,1) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 2nr*2kr + +.L57: + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu 
CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + MTC $0,t11 + LD a0, 0*SIZE(A) # a0 + + MOV t21,t11 + gsLQC1(R9,F9,F8,0) # b0,b1 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L65 + MOV t22,t11 + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R9,F9,F8,0) + + MOV t12,t11 + beqz K,.L65 + MOV t22,t11 +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + MADD t11,t11,a0,b0 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + MADD t11,t11,a4,b4 + + gsLQC1(R9,F11,F10,2) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + MADD t11,t11,a2,b2 + daddiu K,K,-1 + + gsLQC1(R9,F15,F14,3) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + MADD t11,t11,a6,b6 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + gsLQC1(R9,F9,F8,0) # a0 + bnez K,.L61 + MADD t12,t12,a6,b7 + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + gsLQC1(R9,F13,F12,1) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + MADD t11,t11,a4,b4 + + gsLQC1(R9,F9,F8,0) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move 
A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + + MTC $0,t11 + LD b0, 0*SIZE(B) + + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + MOV t41,t11 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + MOV t31,t11 + MOV t41,t11 + gsLQC1(R8,F3,F2,1) #a2,a3 + + beqz K,.L75 + nop +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + gsLQC1(R8,F5,F4,2) + MADD t21,t21,a1,b0 + + gsLQC1(R8,F7,F6,3) + FETCH $0,(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + gsLQC1(R8,F1,F0,4) + MADD t21,t21,a5,b4 + + gsLQC1(R8,F3,F2,5) + FETCH $0,4*SIZE(PREA) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + gsLQC1(R8,F5,F4,6) + MADD t21,t21,a1,b2 + FETCH $0,8*SIZE(PREA) + + gsLQC1(R8,F7,F6,7) + MADD t31,t31,a2,b2 + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + daddu PREA,PREA,16*SIZE + + gsLQC1(R8,F1,F0,0) + MADD t21,t21,a5,b6 + daddiu K,K,-1 + FETCH $0,-32(PREA) + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b6 + bnez K,.L71 + MADD t41,t41,a7,b6 + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + gsLQC1(R8,F5,F4,2) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + gsLQC1(R8,F7,F6,3) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + gsLQC1(R8,F1,F0,0) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + gsLQC1(R8,F3,F2,1) + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddu PREA,PREA,8*SIZE + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
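+/* Reviewer sketch (annotation, not part of the original patch): TRMM tail
+ * fix-up for the mr=4, nr=1 tile. TEMP = KCO-KK is the K range this tile
+ * did not consume (minus 4 when LEFT, minus 1 otherwise); A then advances
+ * by TEMP*4 elements (dsll by 2+BASE_SHIFT) and B by TEMP*1 element
+ * (dsll by 0+BASE_SHIFT), leaving both pointing at the next panel. */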
+ dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + + LD b0, 0*SIZE(B) + MTC $0,t11 + + gsLQC1(R8,F1,F0,0) #a0,a1 + MOV t21,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + gsLQC1(R8,F1,F0,0) #a0,a1 + + beqz K,.L85 + nop +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + gsLQC1(R8,F5,F4,1) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + gsLQC1(R8,F3,F2,2) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + gsLQC1(R8,F7,F6,3) + MADD t11,t11,a2,b2 + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + gsLQC1(R8,F1,F0,0) + MADD t11,t11,a6,b6 + MADD t21,t21,a7,b6 + + daddiu K,K,-1 + bnez K,.L81 + nop + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + gsLQC1(R8,F5,F4,1) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + gsLQC1(R8,F1,F0,0) + LD b0,0(B) + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz 
K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a.S new file mode 100644 index 000000000..4a8c9b0e4 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_loongson3a.S @@ -0,0 +1,2579 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 
+#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + 
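+/* Reviewer sketch (annotation, assumes Loongson3A load latencies): the kr=4
+ * body here is hand-pipelined -- each MADD pair is interleaved with the LD
+ * of an operand needed a few groups later, and FETCH (an ld into $0 acting
+ * as a prefetch) touches the upcoming PREA/PREB cache lines so that memory
+ * latency hides behind the fused multiply-adds. */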
+ MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + 
MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else 
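+/* Reviewer sketch (annotation): non-trivial TRMM entry for the mr=2, nr=4
+ * tile -- skip the first KK steps of the K loop by advancing A by KK*2
+ * elements (KK << (1+BASE_SHIFT) bytes) and B by KK*4 elements
+ * (KK << (2+BASE_SHIFT) bytes) from the start of each packed panel. */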
+ dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD 
c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD 
t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD 
t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + + daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + +.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 + LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu 
A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, 
K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t22,t11 + beqz K,.L65 + nop + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + 
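+# Annotation: the first A-column loads are interleaved with the MTC/MOV
+# accumulator clears so most of the load latency is already hidden when the
+# k-loop below starts issuing MADDs.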
MOV t41,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + 
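+# Annotation: MIPS branches have a one-instruction delay slot; this kernel
+# fills it with useful work where it can (e.g. the daddu CO4,CO4,4*SIZE after
+# bnez M,.L10 earlier) and falls back to a plain nop, as here.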
nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD 
t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LN_loongson3a.S b/kernel/mips64/trsm_kernel_LN_loongson3a.S new file mode 100644 index 000000000..aba86fbce --- /dev/null +++ b/kernel/mips64/trsm_kernel_LN_loongson3a.S @@ -0,0 +1,1938 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + +#define t11 $f16 +#define t21 $f17 +#define t31 $f18 +#define t41 $f19 + +#define t12 $f20 +#define t22 $f21 +#define t32 $f22 +#define t42 $f23 + +#define t13 $f24 +#define t23 $f25 +#define t33 $f26 +#define t43 $f27 + +#define t14 $f28 +#define t24 $f29 +#define t34 $f30 +#define t44 $f31 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + # LN compute from bottom to top + LDARG OFFSET, 144($sp) + dsll LDC, LDC, BASE_SHIFT # ldc + + mult M, K + mflo TEMP # TEMP=MC*KC + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP # A move to the end of sa + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP # C+=MC + + dsra J, N, 2 # j = nc/4 + blez J, .L30 + nop + +.L10: # nr=4 + daddiu J, J, -1 + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, 
t11 + MOV t42, t11 + + daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai + move AORIG, A # reset A + + daddu C, CO4, LDC # fixed pointer C, the write-back address + + andi I, M, 1 # mr=1,nr=4 + blez I, .L50 + nop + + dsll TEMP, K, BASE_SHIFT # mr=1 + dsubu AORIG, AORIG, TEMP # AORIG points to the beginning address of Ai + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 + + daddu AO, AORIG, L # AO points to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t13, t11 # clear the remaining result registers + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part computes the rectangular data part of Ai + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L55 + nop + + + .align 3 +.L52: + LD a5, 1 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2nd compute + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 + nop + + + .align 3 +.L55: + andi L, TEMP, 3 + blez L, .L58 + nop + + .align 3 +.L56: + MADD t11, t11, a1, b1 # remainder compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + nop + + +.L58: # deal with the triangular part + daddiu TEMP, KK, -1 + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # AO points to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + MUL t13, b3, t13 + MUL t14, b3, t14 + + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + + daddiu KK, KK, -1 # the rectangular data part grows by 1 (LN sweeps bottom to top) + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + + +.L50: + andi I, M, 2 # mr=2,nr=4 + blez I, .L20 + nop + + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG points to the beginning address of Ai + +
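+# Annotation: the SUB/MUL/NMSUB sequences in .L58 above and .L28/.L18 below are
+# the small backward substitutions of the LN solve. Assuming the usual
+# GotoBLAS-style packing in which the diagonal of the triangular panel is
+# stored as reciprocals, the 2 x 2 case of .L28 is, in C-like pseudocode:
+#     x1  = inv_a11 * (b1 - rect1);   /* MUL   t21, b1, t21      */
+#     b0' = (b0 - rect0) - a10 * x1;  /* NMSUB t11, t11, b2, t21 */
+#     x0  = inv_a00 * b0';            /* MUL   t11, b3, t11      */
+# The solved values are written back both to the packed B panel and to C.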
dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t13, t11 # mr=2 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L25 + nop + + + .align 3 +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t21, t21, a4, b1 + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t21, t21, a8, b5 + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + nop + + + .align 3 +.L25: + andi L, TEMP, 3 + blez L, .L28 + nop + + .align 3 +.L26: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + nop + + +.L28: # deal with the triangular part + daddiu TEMP, KK, -2 + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + MUL t22, b1, t22 + MUL t23, b1, t23 + MUL t24, b1, t24 + NMSUB t11, t11, b2, t21 + NMSUB t12, t12, b2, t22 + NMSUB t13, t13, b2, t23 + NMSUB t14, t14, b2, t24 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + MUL t13, b3, t13 + MUL t14, b3, t14 + + daddiu CO1, CO1, -2 * 
SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + + + daddiu KK, KK, -2 # the length of rectangular data part increases by 2 + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + +.L20: + dsra I, M, 2 # I=MC/4 + blez I, .L29 + nop + +.L11: # mr=4 + dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai + dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai + dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MOV t13, t11 # clear result registers + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + dsra L, TEMP, 2 # L=(KC-offset)/4 + blez L, .L15 + nop + + .align 3 +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 
* SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 # 4th compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + daddiu L, L, -1 + bgtz L, .L12 + nop + + + .align 3 +.L15: + andi L, TEMP, 3 + blez L, .L18 + nop + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + nop + + +.L18: # deal with the triangular data part of panel Ai + daddiu TEMP, KK, -4 # + + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B + LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + LD b5, 4 * SIZE(BO) # sb store in row major + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + SUB t31, b1, t31 + SUB t32, b2, t32 + SUB t33, b3, t33 + SUB t34, b4, t34 + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + SUB t41, b5, t41 + SUB t42, b6, t42 + SUB t43, b7, t43 + SUB t44, b8, t44 + + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + + MUL t41, b1, t41 + MUL t42, b1, t42 + MUL t43, b1, t43 + MUL t44, b1, t44 + NMSUB t31, t31, b2, t41 + NMSUB t32, t32, b2, t42 + NMSUB t33, t33, b2, t43 + NMSUB t34, t34, b2, t44 + NMSUB t21, t21, b4, t41 + NMSUB t22, t22, b4, t42 + NMSUB t23, t23, b4, t43 + NMSUB t24, t24, b4, t44 + NMSUB t11, t11, b7, t41 + NMSUB t12, t12, b7, t42 + NMSUB t13, t13, b7, t43 + NMSUB t14, t14, b7, t44 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + MUL t32, b3, t32 + MUL t33, b3, t33 + MUL t34, b3, t34 + NMSUB t21, t21, b5, t31 + NMSUB t22, t22, b5, t32 + NMSUB t23, t23, b5, t33 + NMSUB t24, t24, b5, t34 + NMSUB t11, t11, b8, t31 + NMSUB t12, t12, b8, t32 + NMSUB t13, t13, b8, t33 + NMSUB t14, t14, b8, t34 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + MUL t22, b6, t22 + MUL t23, b6, t23 + MUL t24, b6, t24 + NMSUB t11, t11, b1, t21 + NMSUB t12, t12, b1, t22 + NMSUB t13, t13, b1, t23 + NMSUB t14, t14, b1, t24 + + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + MUL t12, b2, t12 + MUL t13, b2, t13 + MUL t14, b2, t14 + + daddiu 
CO1, CO1, -4 * SIZE # modify + daddiu CO2, CO2, -4 * SIZE + daddiu CO3, CO3, -4 * SIZE + daddiu CO4, CO4, -4 * SIZE + + + ST t11, 0 * SIZE(BO) # update packed B + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + ST t31, 8 * SIZE(BO) + ST t32, 9 * SIZE(BO) + ST t33, 10 * SIZE(BO) + ST t34, 11 * SIZE(BO) + ST t41, 12 * SIZE(BO) + ST t42, 13 * SIZE(BO) + ST t43, 14 * SIZE(BO) + ST t44, 15 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + + daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4 + daddiu I, I, -1 + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + bgtz I, .L11 + nop + + .align 3 + +.L29: + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP # B point to next Bj + + bgtz J, .L10 + nop + + + .align 3 +.L30: + andi J, N, 2 # nr=2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + daddu KK, M, OFFSET + move AORIG, A # reset A + + daddu C, CO2, LDC # fixed + + andi I, M, 1 # mr=1 + blez I, .L60 + nop + + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai + + dsll L, KK, BASE_SHIFT # mr=1 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t12, t11 # clear result registers + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L65 + nop + + + .align 3 +.L62: + LD a5, 1 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L62 + nop + + .align 3 + +.L65: + andi L, TEMP, 3 + blez L, .L68 + nop + + .align 3 +.L66: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 1 * SIZE # AO += mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + nop + +.L68: + daddiu TEMP, KK, -1 # mr=1 + + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 
* SIZE + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + + daddiu KK, KK, -1 + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + + + +.L60: + andi I, M, 2 + blez I, .L40 + nop + + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai + + dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + + MOV t12, t11 # clear result registers + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L45 + nop + + + .align 3 +.L42: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + MADD t12, t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L42 + nop + + .align 3 + +.L45: + andi L, TEMP, 3 + blez L, .L48 + nop + + .align 3 +.L46: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + nop + +.L48: + daddiu TEMP, KK, -2 + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L # Ao point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + MUL t22, b1, t22 + NMSUB t11, t11, b2, t21 + NMSUB t12, t12, b2, t22 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + MUL t12, b3, t12 + + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + + daddiu KK, KK, -2 + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + +.L40: + dsra I, M, 2 # I = mc/4 + blez I, .L49 + nop + +.L31: + dsll TEMP, K, 2 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + LD a1, 0 * SIZE(AO) # this part compute the 
rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(BO) # get 4b + LD b2, 1 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L35 + nop + + + .align 3 +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + nop + + + .align 3 + +.L35: + andi L, TEMP, 3 + blez L, .L38 + nop + + .align 3 +.L36: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + nop + + +.L38: # + daddiu TEMP, KK, -4 + dsll L, TEMP, 2 + BASE_SHIFT # mr=4 + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + SUB t31, b5, t31 + SUB t32, b6, t32 + SUB t41, b7, t41 + SUB t42, b8, t42 + + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + + MUL t41, b1, t41 + MUL t42, b1, t42 + NMSUB t31, t31, b2, t41 + NMSUB t32, t32, b2, t42 + NMSUB t21, t21, b4, t41 + NMSUB t22, t22, b4, t42 + NMSUB t11, t11, b7, t41 + NMSUB t12, t12, b7, t42 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + MUL t32, b3, t32 + NMSUB t21, t21, b5, t31 + NMSUB t22, t22, b5, t32 + NMSUB t11, t11, b8, t31 + NMSUB t12, t12, b8, t32 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + MUL t22, b6, t22 + NMSUB t11, t11, b1, t21 + NMSUB t12, t12, b1, t22 + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + MUL t12, b2, t12 + + daddiu CO1, 
CO1, -4 * SIZE + daddiu CO2, CO2, -4 * SIZE + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + ST t31, 4 * SIZE(BO) + ST t32, 5 * SIZE(BO) + ST t41, 6 * SIZE(BO) + ST t42, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + + daddiu KK, KK, -4 + + MTC $0, t11 + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + daddiu I, I, -1 + bgtz I, .L31 + nop + + + + .align 3 +.L49: + dsll TEMP, K, 1 + BASE_SHIFT # nr=2 + daddu B, B, TEMP + + .align 3 + +.L70: + andi J, N, 1 # nr=1 + blez J, .L999 # END + nop + + move CO1, C + + daddu KK, M, OFFSET + move AORIG, A # reset A + + andi I, M, 1 # mr=1 + blez I, .L90 + NOP + + MTC $0, t11 + + dsll TEMP, K, BASE_SHIFT # mr=1 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, L + + dsubu TEMP, K, KK + + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L95 + nop + + .align 3 +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + + daddiu L, L, -1 + bgtz L, .L92 + nop + + .align 3 + +.L95: + andi L, TEMP, 3 + blez L, .L98 + nop + + .align 3 +.L96: + MADD t11, t11, a1, b1 # 3rd compute + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + nop + + +.L98: + daddiu TEMP, KK, -1 # mr=2 + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AORIG, TEMP # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + + SUB t11, b1, t11 + + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + + daddiu CO1, CO1, -1 * SIZE + + ST t11, 0 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + + daddiu KK, KK, -1 + + +.L90: + andi I, M, 2 + blez I, .L80 + NOP + + MTC $0, t11 + MOV t21, t11 # clear result registers + + dsll TEMP, K, 1+BASE_SHIFT # mr=2 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the rectangular data part + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + nop + + .align 3 +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + nop + + .align 3 + +.L85: + andi L, TEMP, 3 + blez L, .L88 + nop + + .align 3 +.L86: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # 
AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + nop + + +.L88: + daddiu TEMP, KK, -2 # mr=2 + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu AO, AORIG, L # AO point to the triangular data part + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b2, 2 * SIZE(AO) + MUL t21, b1, t21 + NMSUB t11, t11, b2, t21 + + LD b3, 0 * SIZE(AO) + MUL t11, b3, t11 + + daddiu CO1, CO1, -2 * SIZE + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + + daddiu KK, KK, -2 + + + .align 3 +.L80: + dsra I, M, 2 + blez I, .L89 + nop + +.L71: + dsll TEMP, K, 2 + BASE_SHIFT # mr=4 + dsubu AORIG, AORIG, TEMP + + dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll TEMP, KK, 0 + BASE_SHIFT # nr=1 + + daddu AO, AORIG, L # AO point to the rectangular + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(BO) # get 4b + + dsra L, TEMP, 2 + blez L, .L75 + nop # reset B + + .align 3 +.L72: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L72 + nop + + .align 3 + +.L75: + andi L, TEMP, 3 + blez L, .L78 + nop + + .align 3 +.L76: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + nop + +.L78: + daddiu TEMP, KK, -4 # mr=4 + + dsll L, TEMP, 2 + BASE_SHIFT # mr=4 + dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1 + daddu AO, AORIG, L # AO point to the triangular + daddu BO, B, TEMP + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b1, 15 * SIZE(AO) + LD b2, 14 * SIZE(AO) + LD b4, 13 * SIZE(AO) + LD b7, 12 * SIZE(AO) + MUL t41, b1, t41 + NMSUB t31, t31, b2, t41 + NMSUB t21, t21, b4, t41 + NMSUB t11, t11, b7, t41 + + + + LD b3, 10 * SIZE(AO) + LD b5, 9 * SIZE(AO) + LD b8, 8 * SIZE(AO) + MUL t31, b3, t31 + NMSUB t21, t21, 
b5, t31 + NMSUB t11, t11, b8, t31 + + + + LD b6, 5 * SIZE(AO) + LD b1, 4 * SIZE(AO) + MUL t21, b6, t21 + NMSUB t11, t11, b1, t21 + + + + LD b2, 0 * SIZE(AO) + MUL t11, b2, t11 + + daddiu CO1, CO1, -4 * SIZE + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + ST t31, 2 * SIZE(BO) + ST t41, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + + daddiu KK, KK, -4 + daddiu I, I, -1 + bgtz I, .L71 + nop + + + .align 3 +.L89: + dsll TEMP, K, BASE_SHIFT # nr=1 + daddu B, B, TEMP + + + + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LT_loongson3a.S b/kernel/mips64/trsm_kernel_LT_loongson3a.S new file mode 100644 index 000000000..4114d94ef --- /dev/null +++ b/kernel/mips64/trsm_kernel_LT_loongson3a.S @@ -0,0 +1,1783 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + +#define t11 $f16 +#define t21 $f17 +#define t31 $f18 +#define t41 $f19 + +#define t12 $f20 +#define t22 $f21 +#define t32 $f22 +#define t42 $f23 + +#define t13 $f24 +#define t23 $f25 +#define t33 $f26 +#define t43 $f27 + +#define t14 $f28 +#define t24 $f29 +#define t34 $f30 +#define t44 $f31 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + # LT compute from left to right, top to bottom + LDARG OFFSET, 144($sp) + dsll LDC, LDC, BASE_SHIFT # ldc + + dsra J, N, 2 # j = nc/4 + blez J, .L30 + nop + +.L10: # nr=4 + daddiu J, J, -1 + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + MTC $0, t11 # clear result registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + dsra I, M, 2 # i = mc/4 + move KK, OFFSET # kk is the length of the rectangular data part of panel Ai + move AO, A # reset A + daddu C, CO4, LDC # fixed pointer C, the write back address + blez I, .L20 + nop + + +.L11: # mr=4 + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * 
SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + MOV t13, t11 # clear result registers + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + dsra L, KK, 2 # L = kk/4 + blez L, .L15 + move BO, B # + + + .align 3 +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 # 4th compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 + + daddiu L, L, -1 + bgtz L, .L12 + nop + + + .align 3 +.L15: + andi L, KK, 3 # the remainder part: KK-KK/4 + blez L, .L18 + nop + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * 
SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + nop + + +.L18: # deal with the triangular data part of panel Ai + LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B + LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + LD b5, 4 * SIZE(BO) # sb store in row major + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + SUB t31, b1, t31 + SUB t32, b2, t32 + SUB t33, b3, t33 + SUB t34, b4, t34 + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + SUB t41, b5, t41 + SUB t42, b6, t42 + SUB t43, b7, t43 + SUB t44, b8, t44 + + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + MUL t12, a1, t12 + MUL t13, a1, t13 + MUL t14, a1, t14 + NMSUB t21, t21, a2, t11 + NMSUB t22, t22, a2, t12 + NMSUB t23, t23, a2, t13 + NMSUB t24, t24, a2, t14 + NMSUB t31, t31, a3, t11 + NMSUB t32, t32, a3, t12 + NMSUB t33, t33, a3, t13 + NMSUB t34, t34, a3, t14 + NMSUB t41, t41, a4, t11 + NMSUB t42, t42, a4, t12 + NMSUB t43, t43, a4, t13 + NMSUB t44, t44, a4, t14 + + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + MUL t22, a5, t22 + MUL t23, a5, t23 + MUL t24, a5, t24 + NMSUB t31, t31, a6, t21 + NMSUB t32, t32, a6, t22 + NMSUB t33, t33, a6, t23 + NMSUB t34, t34, a6, t24 + NMSUB t41, t41, a7, t21 + NMSUB t42, t42, a7, t22 + NMSUB t43, t43, a7, t23 + NMSUB t44, t44, a7, t24 + + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + MUL t32, a8, t32 + MUL t33, a8, t33 + MUL t34, a8, t34 + NMSUB t41, t41, a1, t31 + NMSUB t42, t42, a1, t32 + NMSUB t43, t43, a1, t33 + NMSUB t44, t44, a1, t34 + + + LD a2, 15 * SIZE(AO) + MUL t41, a2, t41 + MUL t42, a2, t42 + MUL t43, a2, t43 + MUL t44, a2, t44 + + ST t11, 0 * SIZE(BO) # update packed B + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + ST t31, 8 * SIZE(BO) + ST t32, 9 * SIZE(BO) + ST t33, 10 * SIZE(BO) + ST t34, 11 * SIZE(BO) + ST t41, 12 * SIZE(BO) + ST t42, 13 * SIZE(BO) + ST t43, 14 * SIZE(BO) + ST t44, 15 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) # write back + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE # fixed pointers + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # mov AO to the end of panel Ai + daddu BO, BO, TEMP # mov BO to the end of panel Bj + + daddiu KK, KK, 4 # the length of rectangular data part increases by 4 + daddiu I, I, -1 + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + bgtz I, .L11 + nop + + + .align 3 +.L20: + andi I, M, 2 # mr=2,nr=4 + blez I, .L50 + nop + + MOV t13, t11 + MOV t23, t11 + MOV t33, 
t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 + blez L, .L25 + move BO, B + + + .align 3 +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t21, t21, a4, b1 + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t21, t21, a8, b5 + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + nop + + + .align 3 +.L25: + andi L, KK, 3 + blez L, .L28 + nop + + .align 3 +.L26: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + nop + + +.L28: # deal with the triangular part + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + SUB t21, b5, t21 + SUB t22, b6, t22 + SUB t23, b7, t23 + SUB t24, b8, t24 + + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + MUL t12, b1, t12 + MUL t13, b1, t13 + MUL t14, b1, t14 + NMSUB t21, t21, b2, t11 + NMSUB t22, t22, b2, t12 + NMSUB t23, t23, b2, t13 + NMSUB t24, t24, b2, t14 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + MUL t22, b3, t22 + MUL t23, b3, t23 + MUL t24, b3, t24 + + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + ST t21, 4 * SIZE(BO) + ST t22, 5 * SIZE(BO) + ST t23, 6 * SIZE(BO) + ST t24, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE 
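+	# the solved 2x4 block above was stored both into the packed B panel
+	# (so the remaining row blocks of this panel reuse the updated values)
+	# and into C; the four column pointers are bumped past it here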
+ daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # mov AO to the end of Ai + daddu BO, BO, TEMP # mov BO to the end of Bj + + daddiu KK, KK, 2 # the length of rectangular data part increases by 2 + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + MOV t12, a1 + MOV t22, a1 + MOV t32, a1 + MOV t42, a1 + + + .align 3 +.L50: + andi I, M, 1 # mr=1,nr=4 + blez I, .L29 + nop + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 + blez L, .L55 + move BO, B + + + .align 3 +.L52: + LD a5, 1 * SIZE(AO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 # 3rd compute + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += mr*4kr + daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 # 4th compute + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 + nop + + + .align 3 +.L55: + andi L, KK, 3 + blez L, .L58 + nop + + .align 3 +.L56: + MADD t11, t11, a1, b1 # 3rd compute + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BO += 4nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + nop + + +.L58: # deal with the triangular part + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t13, b3, t13 + SUB t14, b4, t14 + + + LD b1, 0 * SIZE(AO) # computes the triangular_part + MUL t11, b1, t11 + MUL t12, b1, t12 + MUL t13, b1, t13 + MUL t14, b1, t14 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t13, 2 * SIZE(BO) + ST t14, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # mov AO to the end of Ai + daddu BO, BO, TEMP # mov BO to the end of Bj + + daddiu KK, KK, 1 # the length of rectangular data part increases by 2 + + .align 3 +.L29: + move B, BO # fixed panel Bj + bgtz J, .L10 + nop + + + .align 3 +.L30: + andi J, N, 2 # nr=2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + move KK, OFFSET + move AO, A # 
reset A + daddu C, CO2, LDC # fixed + + dsra I, M, 2 # I = mc/4 + blez I, .L40 + nop + +.L31: + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(B) # get 4b + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 # L=kk/4 + blez L, .L35 + move BO, B # reset B + + + .align 3 +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + nop + + + .align 3 + +.L35: + andi L, KK, 3 + blez L, .L38 + nop + + .align 3 +.L36: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + nop + + +.L38: # + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + SUB t31, b5, t31 + SUB t32, b6, t32 + SUB t41, b7, t41 + SUB t42, b8, t42 + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + MUL t12, a1, t12 + NMSUB t21, t21, a2, t11 + NMSUB t22, t22, a2, t12 + NMSUB t31, t31, a3, t11 + NMSUB t32, t32, a3, t12 + NMSUB t41, t41, a4, t11 + NMSUB t42, t42, a4, t12 + + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + MUL t22, a5, t22 + NMSUB t31, t31, a6, t21 + NMSUB t32, t32, a6, t22 + NMSUB t41, t41, a7, t21 + NMSUB t42, t42, a7, t22 + + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + MUL t32, a8, t32 + NMSUB t41, t41, a1, t31 + NMSUB t42, t42, a1, t32 + + + LD a2, 15 * SIZE(AO) + MUL 
t41, a2, t41 + MUL t42, a2, t42 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + ST t31, 4 * SIZE(BO) + ST t32, 5 * SIZE(BO) + ST t41, 6 * SIZE(BO) + ST t42, 7 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of Ai + daddu BO, BO, TEMP + + daddiu KK, KK, 4 # + + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + + daddiu I, I, -1 + bgtz I, .L31 + nop + + + .align 3 +.L40: + andi I, M, 2 + blez I, .L60 + nop + + MOV t12, t11 # clear result registers + MOV t22, t21 + MOV t32, t31 + MOV t42, t41 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 + blez L, .L45 + move BO, B # reset B + + + .align 3 +.L42: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + MADD t12, t12, a3, b4 + MADD t22, t22, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + MADD t12, t12, a7, b8 + MADD t22, t22, a8, b8 + + daddiu L, L, -1 + bgtz L, .L42 + nop + + .align 3 + +.L45: + andi L, KK, 3 + blez L, .L48 + nop + + .align 3 +.L46: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L46 + nop + +.L48: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + SUB t21, b3, t21 + SUB t22, b4, t22 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + MUL t12, b1, t12 + NMSUB t21, t21, b2, t11 + NMSUB t22, t22, b2, t12 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + MUL t22, b3, t22 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + ST t21, 2 * SIZE(BO) + ST t22, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 2 + MTC $0, a1 + MOV t11, a1 + MOV t21, a1 + MOV t31, a1 + MOV t41, a1 + + + .align 3 +.L60: + andi I, M, 1 # mr=1 + blez I, .L49 + nop + + MOV t12, t11 # clear result registers + MOV t22, t21 + MOV t32, t31 + MOV t42, t41 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + + dsra L, KK, 2 + blez L, .L65 + move BO, B # 
reset B + + + .align 3 +.L62: + LD a5, 1 * SIZE(AO) + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t12, t12, a1, b2 + + LD a3, 2 * SIZE(AO) + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t12, t12, a5, b6 + + LD a7, 3 * SIZE(AO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t12, t12, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += mr*4kr + daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t12, t12, a7, b8 + + daddiu L, L, -1 + bgtz L, .L62 + nop + + .align 3 + +.L65: + andi L, KK, 3 + blez L, .L68 + nop + + .align 3 +.L66: + MADD t11, t11, a1, b1 # 3rd compute + MADD t12, t12, a1, b2 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 2 * SIZE # BO += 2nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L66 + nop + +.L68: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t12, b2, t12 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + MUL t11, b1, t11 + MUL t12, b1, t12 + + ST t11, 0 * SIZE(BO) + ST t12, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT # mr=1 + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 1 + + .align 3 +.L49: + move B, BO + + .align 3 + +.L70: + andi J, N, 1 # nr=1 + blez J, .L999 # END + nop + + move CO1, C + + move KK, OFFSET + move AO, A + + dsra I, M, 2 + blez I, .L80 + nop + +.L71: + MTC $0, t11 # clear result regusters + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai + LD a2, 1 * SIZE(AO) # mr*KK with nr*KK + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) # get 4a + + LD b1, 0 * SIZE(B) # get 4b + + dsra L, KK, 2 + blez L, .L75 + move BO, B # reset B + + .align 3 +.L72: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a1, b3 # 3rd compute + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a5, b7 # 4th compute + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + daddiu L, L, -1 + bgtz L, .L72 + nop + + .align 3 + +.L75: + andi L, KK, 3 + blez L, .L78 + nop + + .align 3 +.L76: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L76 + nop + 
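For reference, the nr=1/mr=4 step that follows (.L71 through .L78) can be modeled in C roughly as below. The function name, the pointer arguments, and the assumption that the packed diagonal entries hold reciprocals (so a multiply stands in for the division) are illustrative, not part of the patch; OpenBLAS's trsm copy routines normally pre-invert the diagonal for the non-unit case.

    /* Hedged sketch of one mr=4, nr=1 LT step: kk rows are already
     * solved; ao points at this row block's packed slice of A, bo at
     * the packed right-hand-side column, c at the output column. */
    static void lt_step_4x1(const double *ao, double *bo, double *c, long kk)
    {
        double t[4] = {0.0, 0.0, 0.0, 0.0};
        long l;
        int i;

        for (l = 0; l < kk; l++)          /* rectangular part (.L72/.L76) */
            for (i = 0; i < 4; i++)
                t[i] += ao[4 * l + i] * bo[l];

        ao += 4 * kk;                     /* the 4x4 triangular block */
        bo += kk;

        for (i = 0; i < 4; i++)           /* t = b - t, as in .L78 */
            t[i] = bo[i] - t[i];

        /* forward substitution; the packed diagonal entries ao[0],
         * ao[5], ao[10], ao[15] are assumed pre-inverted */
        t[0] *= ao[0];
        t[1] = (t[1] - ao[1] * t[0]) * ao[5];
        t[2] = (t[2] - ao[2] * t[0] - ao[6] * t[1]) * ao[10];
        t[3] = (t[3] - ao[3] * t[0] - ao[7] * t[1] - ao[11] * t[2]) * ao[15];

        for (i = 0; i < 4; i++) {         /* update packed B and write C */
            bo[i] = t[i];
            c[i] = t[i];
        }
    }

The mr=4/nr=4 path (.L11 through .L18) follows the same pattern with four right-hand-side columns, and the LN kernel runs the same recurrence bottom-up instead of top-down.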
+.L78: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD a1, 0 * SIZE(AO) # sa stores in col major + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MUL t11, a1, t11 + NMSUB t21, t21, a2, t11 + NMSUB t31, t31, a3, t11 + NMSUB t41, t41, a4, t11 + + LD a5, 5 * SIZE(AO) + LD a6, 6 * SIZE(AO) + LD a7, 7 * SIZE(AO) + MUL t21, a5, t21 + NMSUB t31, t31, a6, t21 + NMSUB t41, t41, a7, t21 + + LD a8, 10 * SIZE(AO) + LD a1, 11 * SIZE(AO) + MUL t31, a8, t31 + NMSUB t41, t41, a1, t31 + + LD a2, 15 * SIZE(AO) + MUL t41, a2, t41 + + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + ST t31, 2 * SIZE(BO) + ST t41, 3 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + daddiu CO1, CO1, 4 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 4 + daddiu I, I, -1 + bgtz I, .L71 + nop + + + .align 3 + +.L80: + andi I, M, 2 + blez I, .L90 + NOP + + MTC $0, t11 + MOV t21, t11 # clear result registers + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B + + .align 3 +.L82: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + MADD t21, t21, a2, b1 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + MADD t21, t21, a6, b5 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + MADD t21, t21, a4, b3 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + MADD t21, t21, a8, b7 + + daddiu L, L, -1 + bgtz L, .L82 + nop + + .align 3 + +.L85: + andi L, KK, 3 + blez L, .L88 + nop + + .align 3 +.L86: + MADD t11, t11, a1, b1 # 3rd compute + MADD t21, t21, a2, b1 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L86 + nop + + +.L88: + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b2, 1 * SIZE(AO) + MUL t11, b1, t11 + NMSUB t21, t21, b2, t11 + + LD b3, 3 * SIZE(AO) + MUL t21, b3, t21 + + ST t11, 0 * SIZE(BO) + ST t21, 1 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + ST t21, 1 * SIZE(CO1) + + + daddiu CO1, CO1, 2 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 2 + + + .align 3 +.L90: + andi I, M, 1 # mr=1 + blez I, .L89 + NOP + + MTC $0, t11 + + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(B) + + dsra L, KK, 2 + blez L, .L95 + move BO, B + + .align 3 +.L92: + LD a5, 1 * SIZE(AO) + LD b5, 1 * SIZE(BO) + + MADD t11, t11, a1, b1 # 1st compute + + LD a3, 2 * SIZE(AO) + LD b3, 2 * SIZE(BO) + + MADD t11, t11, a5, b5 # 2ed compute + + LD a7, 3 * SIZE(AO) + LD b7, 3 * SIZE(BO) + + MADD t11, t11, a3, b3 # 3rd compute + + daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + MADD t11, t11, a7, b7 # 4th compute + + daddiu L, L, -1 + bgtz L, .L92 + nop + + .align 3 +.L95: + 
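+	# tail: the KK mod 4 leftover multiply-accumulates, one k per pass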
andi L, KK, 3 + blez L, .L98 + nop + + .align 3 +.L96: + MADD t11, t11, a1, b1 # 3rd compute + + daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu BO, BO, 1 * SIZE # BO += 1nr + + LD a1, 0 * SIZE(AO) # next + LD b1, 0 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L96 + nop + + +.L98: + LD b1, 0 * SIZE(BO) + + SUB t11, b1, t11 + + LD b1, 0 * SIZE(AO) # computes the triangular_part + MUL t11, b1, t11 + + ST t11, 0 * SIZE(BO) + + ST t11, 0 * SIZE(CO1) + + + daddiu CO1, CO1, 1 * SIZE + + dsubu TEMP, K, KK + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP + + daddiu KK, KK, 1 + + + .align 3 +.L89: + move B, BO + + + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_RN_loongson3a.S b/kernel/mips64/trsm_kernel_RN_loongson3a.S new file mode 100644 index 000000000..790d7c981 --- /dev/null +++ b/kernel/mips64/trsm_kernel_RN_loongson3a.S @@ -0,0 +1,1852 @@ +#define REALNAME ASMNAME + +#define ASSEMBLER +#include "common.h" + + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define a5 $f28 +#define a6 $f29 +#define a7 $f30 +#define a8 $f31 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 + +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define t11 $f10 +#define t21 $f11 +#define t31 $f12 +#define t41 $f13 + +#define t12 $f14 +#define t22 $f15 +#define t32 $f16 +#define t42 $f17 + +#define t13 $f18 +#define t23 $f19 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f22 +#define t24 $f23 +#define t34 $f24 +#define t44 $f25 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + # RN compute from top to bottom left to right + .align 3 + LDARG OFFSET, 144($sp) # get the last parameter + dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte + + neg KK, OFFSET # for RN OFFSET always 0 + + dsra J, N, 2 # J = NC/4 + blez J, .L30 + NOP + +.L10: + daddiu J, J, -1 + + move CO1, C + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + + move AO, A # A is the retangular matrix and B is the trigular matrix + daddu C, CO4, LDC # Fixed pointer C + + dsra I, M, 2 # I=MC/4 + blez I, .L20 + NOP + + .align 3 +.L11: + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, 
t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L15 + move BO, B # reset B + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # fisrt + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # second + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 # third + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + MADD t33, t33, a7, b7 + MADD t43, t43, a8, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + MADD t34, t34, a7, b8 + MADD t44, t44, a8, b8 # fouth + + daddiu L, L, -1 + bgtz L, .L12 + NOP + + +.L15: + andi L, KK, 3 # deal with kc remainder part + blez L, .L18 + NOP + + .align 3 +.L16: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + MADD t33, t33, a3, b3 + MADD t43, t43, a4, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + MADD t34, t34, a3, b4 + MADD t44, t44, a4, b4 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 4 * SIZE # BP += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) 
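+	# refilling a1..a4 and b1..b4 for the next tail iteration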
+ LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L16 + NOP + + + .align 3 +.L18: # .L18 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) # sa stored as col major + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, b8, t42 + + LD b1, 8 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 10 * SIZE(AO) + LD b4, 11 * SIZE(AO) + + SUB t13, b1, t13 + SUB t23, b2, t23 + SUB t33, b3, t33 + SUB t43, b4, t43 + + LD b5, 12 * SIZE(AO) + LD b6, 13 * SIZE(AO) + LD b7, 14 * SIZE(AO) + LD b8, 15 * SIZE(AO) + + SUB t14, b5, t14 + SUB t24, b6, t24 + SUB t34, b7, t34 + SUB t44, b8, t44 + + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + MUL t31, b1, t31 + MUL t41, b1, t41 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + NMSUB t32, t32, b2, t31 + NMSUB t42, t42, b2, t41 + NMSUB t13, t13, b3, t11 + NMSUB t23, t23, b3, t21 + NMSUB t33, t33, b3, t31 + NMSUB t43, t43, b3, t41 + NMSUB t14, t14, b4, t11 + NMSUB t24, t24, b4, t21 + NMSUB t34, t34, b4, t31 + NMSUB t44, t44, b4, t41 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + MUL t32, b5, t32 + MUL t42, b5, t42 + NMSUB t13, t13, b6, t12 + NMSUB t23, t23, b6, t22 + NMSUB t33, t33, b6, t32 + NMSUB t43, t43, b6, t42 + NMSUB t14, t14, b7, t12 + NMSUB t24, t24, b7, t22 + NMSUB t34, t34, b7, t32 + NMSUB t44, t44, b7, t42 + + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + MUL t23, b8, t23 + MUL t33, b8, t33 + MUL t43, b8, t43 + NMSUB t14, t14, b1, t13 + NMSUB t24, t24, b1, t23 + NMSUB t34, t34, b1, t33 + NMSUB t44, t44, b1, t43 + + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + MUL t24, b2, t24 + MUL t34, b2, t34 + MUL t44, b2, t44 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t13, 8 * SIZE(AO) + ST t23, 9 * SIZE(AO) + ST t33, 10 * SIZE(AO) + ST t43, 11 * SIZE(AO) + + ST t14, 12 * SIZE(AO) + ST t24, 13 * SIZE(AO) + ST t34, 14 * SIZE(AO) + ST t44, 15 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + ST t33, 2 * SIZE(CO3) + ST t43, 3 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE # fixed address + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + daddiu I, I, -1 + bgtz I, .L11 + NOP + + .align 3 +.L20: + andi I, M, 2 # mr=2 + blez I, .L50 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L25 + move BO, B # reset B + +.L22: + LD a5, 2 * SIZE(AO) + LD a6, 3 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + + MADD t13, t13, a5, b7 + MADD t23, t23, a6, b7 + + MADD t14, t14, a5, b8 + MADD t24, t24, a6, b8 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t21, t21, a4, b1 + + MADD t12, t12, a3, b2 + MADD t22, t22, a4, b2 + + MADD t13, t13, a3, b3 + MADD t23, t23, a4, b3 + + MADD t14, t14, a3, b4 + MADD t24, t24, a4, b4 + + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t21, t21, a8, b5 + + MADD t12, t12, a7, b6 + MADD t22, t22, a8, b6 + + MADD t13, t13, a7, b7 + MADD t23, t23, a8, b7 + + MADD t14, t14, a7, b8 + MADD t24, t24, a8, b8 + + daddiu L, L, -1 + bgtz L, .L22 + NOP + + +.L25: + andi L, KK, 3 # deal with kc remainder part + blez L, .L28 + NOP + + .align 3 +.L26: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + + MADD t13, t13, a1, b3 + MADD t23, t23, a2, b3 + + MADD t14, t14, a1, b4 + MADD t24, t24, a2, b4 + + daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu BO, BO, 4 * SIZE # BP += 4nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L26 + NOP + + + .align 3 +.L28: # .L18 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + + SUB t11, b1, t11 + SUB t21, b2, t21 + + LD b5, 2 * SIZE(AO) + LD b6, 3 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + + LD b3, 4 * SIZE(AO) + LD b4, 5 * SIZE(AO) + + SUB t13, b3, t13 + SUB t23, b4, t23 + + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t14, b7, t14 + SUB t24, b8, t24 + + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + NMSUB t12, t12, b2, t11 + 
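+	# b1 held the diagonal of this column block (assumed pre-inverted by
+	# the packing), so t11/t21 are now final; the surrounding NMSUBs
+	# eliminate b2..b4 times this solved column from the later columns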
NMSUB t22, t22, b2, t21 + NMSUB t13, t13, b3, t11 + NMSUB t23, t23, b3, t21 + NMSUB t14, t14, b4, t11 + NMSUB t24, t24, b4, t21 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + NMSUB t13, t13, b6, t12 + NMSUB t23, t23, b6, t22 + NMSUB t14, t14, b7, t12 + NMSUB t24, t24, b7, t22 + + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + MUL t23, b8, t23 + NMSUB t14, t14, b1, t13 + NMSUB t24, t24, b1, t23 + + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + MUL t24, b2, t24 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + + ST t12, 2 * SIZE(AO) + ST t22, 3 * SIZE(AO) + + ST t13, 4 * SIZE(AO) + ST t23, 5 * SIZE(AO) + + ST t14, 6 * SIZE(AO) + ST t24, 7 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + + ST t13, 0 * SIZE(CO3) + ST t23, 1 * SIZE(CO3) + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE # fixed address + daddiu CO2, CO2, 2 * SIZE # mr=2 + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 1 + BASE_SHIFT # mr=2 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + .align 3 +.L50: + andi I, M, 1 # mr=1 + blez I, .L29 + nop + + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + MOV t13, t11 + MOV t23, t11 + MOV t33, t11 + MOV t43, t11 + + MOV t14, t11 + MOV t24, t11 + MOV t34, t11 + MOV t44, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L55 + move BO, B # reset B + +.L52: + LD a5, 1 * SIZE(AO) + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + LD a3, 2 * SIZE(AO) + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t12, t12, a5, b6 + MADD t13, t13, a5, b7 + MADD t14, t14, a5, b8 + + LD a7, 3 * SIZE(AO) + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MADD t11, t11, a3, b1 + MADD t12, t12, a3, b2 + MADD t13, t13, a3, b3 + MADD t14, t14, a3, b4 + + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MADD t11, t11, a7, b5 + MADD t12, t12, a7, b6 + MADD t13, t13, a7, b7 + MADD t14, t14, a7, b8 + + daddiu L, L, -1 + bgtz L, .L52 + NOP + + +.L55: + andi L, KK, 3 # deal with kc remainder part + blez L, .L58 + NOP + + .align 3 +.L56: + MADD t11, t11, a1, b1 + MADD t12, t12, a1, b2 + MADD t13, t13, a1, b3 + MADD t14, t14, a1, b4 + + daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu BO, BO, 4 * SIZE # BP += 4nr + + LD a1, 0 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L56 + NOP + + + .align 3 +.L58: # .L18 
always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b5, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b7, 3 * SIZE(AO) + + SUB t11, b1, t11 + SUB t12, b5, t12 + SUB t13, b3, t13 + SUB t14, b7, t14 + + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MUL t11, b1, t11 + NMSUB t12, t12, b2, t11 + NMSUB t13, t13, b3, t11 + NMSUB t14, t14, b4, t11 + + + LD b5, 5 * SIZE(BO) + LD b6, 6 * SIZE(BO) + LD b7, 7 * SIZE(BO) + MUL t12, b5, t12 + NMSUB t13, t13, b6, t12 + NMSUB t14, t14, b7, t12 + + + LD b8, 10 * SIZE(BO) + LD b1, 11 * SIZE(BO) + MUL t13, b8, t13 + NMSUB t14, t14, b1, t13 + + + LD b2, 15 * SIZE(BO) + MUL t14, b2, t14 + + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t12, 1 * SIZE(AO) + ST t13, 2 * SIZE(AO) + ST t14, 3 * SIZE(AO) + + + ST t11, 0 * SIZE(CO1) # write back results + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE # fixed address + daddiu CO2, CO2, 1 * SIZE # mr=2 + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, BASE_SHIFT # mr=2 + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel + daddu BO, BO, TEMP # move BO to the end of this panel + + + .align 3 +.L29: + move B, BO # change to next panel of Bj + daddiu KK, KK, 4 # rectangular data length increase by 4 + bgtz J, .L10 + NOP + + + .align 3 + +.L30: + andi J, N, 2 + blez J, .L70 + nop + + move CO1, C + daddu CO2, C, LDC + + move AO, A # A is the retangular matrix and B is the trigular matrix + daddu C, CO2, LDC # Fixed pointer C + + dsra I, M, 2 # I=MC/4 + blez I, .L40 + NOP + + .align 3 +.L31: + MTC $0, t11 # clear results registers + MOV t21, t11 + MOV t31, t11 + MOV t41, t11 + + MOV t12, t11 + MOV t22, t11 + MOV t32, t11 + MOV t42, t11 + + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa + LD a2, 1 * SIZE(AO) # get 4 a + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj + LD b2, 1 * SIZE(B) # get 4 b + + dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj + blez L, .L35 + move BO, B # reset B + +.L32: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + + LD b5, 2 * SIZE(BO) + LD b6, 3 * SIZE(BO) + + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b3, 4 * SIZE(BO) + LD b4, 5 * SIZE(BO) + + MADD t11, t11, a5, b5 + MADD t21, t21, a6, b5 + MADD t31, t31, a7, b5 + MADD t41, t41, a8, b5 + + MADD t12, t12, a5, b6 + MADD t22, t22, a6, b6 + MADD t32, t32, a7, b6 + MADD t42, t42, a8, b6 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MADD t11, t11, a1, b3 + MADD t21, t21, a2, b3 + MADD t31, t31, a3, b3 + MADD t41, t41, a4, b3 + + MADD t12, t12, a1, b4 + MADD t22, t22, a2, b4 + MADD t32, t32, a3, b4 + MADD t42, t42, a4, b4 + + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * 
SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MADD t11, t11, a5, b7 + MADD t21, t21, a6, b7 + MADD t31, t31, a7, b7 + MADD t41, t41, a8, b7 + + MADD t12, t12, a5, b8 + MADD t22, t22, a6, b8 + MADD t32, t32, a7, b8 + MADD t42, t42, a8, b8 + + daddiu L, L, -1 + bgtz L, .L32 + NOP + + +.L35: + andi L, KK, 3 # deal with kc remainder part + blez L, .L38 + NOP + + .align 3 +.L36: + MADD t11, t11, a1, b1 + MADD t21, t21, a2, b1 + MADD t31, t31, a3, b1 + MADD t41, t41, a4, b1 + + MADD t12, t12, a1, b2 + MADD t22, t22, a2, b2 + MADD t32, t32, a3, b2 + MADD t42, t42, a4, b2 + + daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu BO, BO, 2 * SIZE # BP += 2nr + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + daddiu L, L, -1 + bgtz L, .L36 + NOP + + + .align 3 +.L38: # .L38 always deal with the trigular data part + LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix + LD b2, 1 * SIZE(AO) # Fixed results + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) # sa stored as col major + + SUB t11, b1, t11 + SUB t21, b2, t21 + SUB t31, b3, t31 + SUB t41, b4, t41 + + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB t12, b5, t12 + SUB t22, b6, t22 + SUB t32, b7, t32 + SUB t42, b8, t42 + + + LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj + LD b2, 1 * SIZE(BO) + MUL t11, b1, t11 + MUL t21, b1, t21 + MUL t31, b1, t31 + MUL t41, b1, t41 + NMSUB t12, t12, b2, t11 + NMSUB t22, t22, b2, t21 + NMSUB t32, t32, b2, t31 + NMSUB t42, t42, b2, t41 + + LD b5, 3 * SIZE(BO) + MUL t12, b5, t12 + MUL t22, b5, t22 + MUL t32, b5, t32 + MUL t42, b5, t42 + + + ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute + ST t21, 1 * SIZE(AO) + ST t31, 2 * SIZE(AO) + ST t41, 3 * SIZE(AO) + + ST t12, 4 * SIZE(AO) + ST t22, 5 * SIZE(AO) + ST t32, 6 * SIZE(AO) + ST t42, 7 * SIZE(AO) + + ST t11, 0 * SIZE(CO1) # write back results + ST t21, 1 * SIZE(CO1) + ST t31, 2 * SIZE(CO1) + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE # fixed address + daddiu CO2, CO2, 4 * SIZE + + dsubu TEMP, K, KK # temp = kc - retangular data length of every panel + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 + daddu AO, AO, L # move AO to the end of this panel. 
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ NOP
+
+ .align 3
+.L40:
+ andi I, M, 2
+ blez I, .L60
+ nop
+
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+
+ MOV t12, t11
+ MOV t22, t11
+
+ LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 2 a
+
+ LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 2 b
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L45
+ move BO, B # reset B
+
+.L42:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ NOP
+
+.L45:
+ andi L, KK, 3 # handle the kc remainder part
+ blez L, .L48
+ NOP
+
+ .align 3
+.L46:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ NOP
+
+ .align 3
+.L48: # .L48 always deals with the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT, A is the result matrix
+ LD b2, 1 * SIZE(AO) # fix up the results
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b5, 2 * SIZE(AO)
+ LD b6, 3 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ NMSUB t12, t12, b2, t11
+ NMSUB t22, t22, b2, t21
+
+ LD b5, 3 * SIZE(BO)
+ MUL t12, b5, t12
+ MUL t22, b5, t22
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up computation
+ ST t21, 1 * SIZE(AO)
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE # advance C pointers
+ daddiu CO2, CO2, 2 * SIZE
+
+ dsubu TEMP, K, KK # TEMP = kc - rectangular data length of this panel
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AO, L # move AO to the end of this panel, also the beginning of the next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ .align 3
+.L60:
+ andi I, M, 1 # nr=2, mr=1
+ blez I, .L39
+ nop
+
+ MTC $0, t11 # clear result registers
+ MOV t12, t11
+
+ LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
+
+ LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
+ LD b2, 1 * SIZE(B) # get 2 b
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L65
+ move BO, B # reset B
+
+.L62:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ NOP
+
+.L65:
+ andi L, KK, 3 # handle the kc remainder part
+ blez L, .L68
+ NOP
+
+ .align 3
+.L66:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ NOP
+
+ .align 3
+.L68: # .L68 always deals with the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT, A is the result matrix
+ LD b5, 1 * SIZE(AO) # fix up the results
+
+ SUB t11, b1, t11
+ SUB t12, b5, t12
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ LD b2, 1 * SIZE(BO)
+ MUL t11, b1, t11
+ NMSUB t12, t12, b2, t11
+
+ LD b5, 3 * SIZE(BO)
+ MUL t12, b5, t12
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up computation
+ ST t12, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t12, 0 * SIZE(CO2)
+
+ daddiu CO1, CO1, 1 * SIZE # advance C pointers
+ daddiu CO2, CO2, 1 * SIZE
+
+ dsubu TEMP, K, KK # TEMP = kc - rectangular data length of this panel
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AO, L # move AO to the end of this panel, also the beginning of the next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ .align 3
+.L39:
+ move B, BO # change to the next panel of Bj
+ daddiu KK, KK, 2 # rectangular data length increases by 2
+
+ .align 3
+
+.L70:
+ andi J, N, 1 # nr=1
+ blez J, .L999
+ NOP
+
+ move CO1, C
+ move AO, A
+
+ daddu C, CO1, LDC
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L80
+ NOP
+
+ .align 3
+.L71:
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 4 a
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L75
+ move BO, B # reset B
+
+.L72:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ NOP
+
+.L75:
+ andi L, KK, 3 # handle the kc remainder part
+ blez L, .L78
+ NOP
+
+ .align 3
+.L76:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ NOP
+
+ .align 3
+.L78: # .L78 always deals with the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT, A is the result matrix
+ LD b2, 1 * SIZE(AO) # fix up the results
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO) # sa is stored in column-major order
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+ MUL t31, b1, t31
+ MUL t41, b1, t41
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up computation
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ daddiu CO1, CO1, 4 * SIZE # advance C pointer
+
+ dsubu TEMP, K, KK # TEMP = kc - rectangular data length of this panel
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel, also the beginning of the next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
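+# With nr=1 the triangular factor of Bj degenerates to a single element at
+# BO offset 0, so the solve above reduces to scaling each fixed-up result by
+# that (presumably pre-inverted) diagonal entry; no NMSUB elimination step
+# is required.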
+
+ daddiu I, I, -1
+ bgtz I, .L71
+ NOP
+
+ .align 3
+.L80:
+ andi I, M, 2 # mr=2
+ blez I, .L90
+ nop
+
+ MTC $0, t11 # clear result registers
+ MOV t21, t11
+
+ LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
+ LD a2, 1 * SIZE(AO) # get 2 a
+
+ LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L85
+ move BO, B # reset B
+
+.L82:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ NOP
+
+.L85:
+ andi L, KK, 3 # handle the kc remainder part
+ blez L, .L88
+ NOP
+
+ .align 3
+.L86:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ NOP
+
+ .align 3
+.L88: # .L88 always deals with the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT, A is the result matrix
+ LD b2, 1 * SIZE(AO) # fix up the results
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ MUL t11, b1, t11
+ MUL t21, b1, t21
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up computation
+ ST t21, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back results
+ ST t21, 1 * SIZE(CO1)
+
+ daddiu CO1, CO1, 2 * SIZE # advance C pointer
+
+ dsubu TEMP, K, KK # TEMP = kc - rectangular data length of this panel
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel, also the beginning of the next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ .align 3
+.L90:
+ andi I, M, 1 # mr=1
+ blez I, .L79
+ nop
+
+ MTC $0, t11 # clear the result register
+
+ LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
+ LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
+
+ dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
+ blez L, .L95
+ move BO, B # reset B
+
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ NOP
+
+.L95:
+ andi L, KK, 3 # handle the kc remainder part
+ blez L, .L98
+ NOP
+
+ .align 3
+.L96:
+ MADD t11, t11, a1, b1
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ NOP
+
+ .align 3
+.L98: # .L98 always deals with the triangular data part
+ LD b1, 0 * SIZE(AO) # for RN & RT, A is the result matrix
+
+ SUB t11, b1, t11
+
+ LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
+ MUL t11, b1, t11
+
+ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up computation
+
+ ST t11, 0 * SIZE(CO1) # write back results
+
+ daddiu CO1, CO1, 1 * SIZE # advance C pointer
+
+ dsubu TEMP, K, KK # TEMP = kc - rectangular data length of this panel
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AO, L # move AO to the end of this panel, also the beginning of the next panel
+ daddu BO, BO, TEMP # move BO to the end of this panel
+
+ .align 3
+.L79:
+ move B, BO
+ daddiu KK, KK, 1
+
+ .align 3
+
+.L999:
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ ldc1 $f20,112($sp)
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31
+ daddiu $sp, $sp, 144
+
+ EPILOGUE
diff --git a/kernel/mips64/trsm_kernel_RT_loongson3a.S b/kernel/mips64/trsm_kernel_RT_loongson3a.S
new file mode 100644
index 000000000..cf20cf9e0
--- /dev/null
+++ b/kernel/mips64/trsm_kernel_RT_loongson3a.S
@@ -0,0 +1,1958 @@
+#define REALNAME ASMNAME
+
+#define ASSEMBLER
+#include "common.h"
+
+
+#define M $4
+#define N $5
+#define K $6
+#define A $8
+#define B $9
+#define C $10
+#define LDC $11
+
+#define AO $12
+#define BO $13
+
+#define I $2
+#define J $3
+#define L $7
+
+#define CO1 $14
+#define CO2 $15
+#define CO3 $16
+#define CO4 $17
+
+#define OFFSET $22
+#define KK $23
+#define TEMP $24
+#define AORIG $25
+
+#define a1 $f0
+#define a2 $f1
+#define a3 $f26
+#define a4 $f27
+
+#define a5 $f28
+#define a6 $f29
+#define a7 $f30
+#define a8 $f31
+
+#define b1 $f2
+#define b2 $f3
+#define b3 $f4
+#define b4 $f5
+
+#define b5 $f6
+#define b6 $f7
+#define b7 $f8
+#define b8 $f9
+
+#define t11 $f10
+#define t21 $f11
+#define t31 $f12
+#define t41 $f13
+
+#define t12 $f14
+#define t22 $f15
+#define t32 $f16
+#define t42 $f17
+
+#define t13 $f18
+#define t23 $f19
+#define t33 $f20
+#define t43 $f21
+
+#define t14 $f22
+#define t24 $f23
+#define t34 $f24
+#define t44 $f25
+
+ PROLOGUE
+
+ daddiu $sp, $sp, -144
+
+ SDARG $16, 0($sp)
+ SDARG $17, 8($sp)
+ SDARG $18, 16($sp)
+ SDARG $19, 24($sp)
+ SDARG $20, 32($sp)
+ SDARG $21, 40($sp)
+ sdc1 $f24, 48($sp)
+ sdc1 $f25, 56($sp)
+ sdc1 $f26, 64($sp)
+ sdc1 $f27, 72($sp)
+ sdc1 $f28, 80($sp)
+
+ SDARG $22, 88($sp)
+ SDARG $23, 96($sp)
+ SDARG $24, 104($sp)
+ SDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ sdc1 $f20,112($sp)
+ sdc1 $f21,120($sp)
+ sdc1 $f22,128($sp)
+ sdc1 $f23,136($sp)
+#endif
+
+
+ .align 3 # RT computes from right to left
+ LDARG OFFSET, 144($sp) # get the last parameter
+ dsll LDC, LDC, BASE_SHIFT # LDC * data bytes
+
+ mult N, K
+ mflo TEMP
+
+ dsll TEMP, TEMP, BASE_SHIFT # B represents the triangular matrix
+ daddu B, B, TEMP # B points to the end of sb
+ # be careful: B is not affected by mc
+ mult N, LDC
+ mflo TEMP
+ daddu C, C, TEMP # C points past the last column of the block
+
+ dsubu KK, K, OFFSET # KC-KK is the length of the rectangular data part of Bj
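+# RT walks the panels of Bj from right to left: B is first moved one element
+# past the end of the packed B and C one column past the output block, and
+# both step backwards by one panel per pass. KK = K - OFFSET here; for each
+# panel, AO = AORIG + KK*mr and BO = B + KK*nr address the rectangular part
+# (contributions of the already-solved panels), which spans K - KK values
+# of k.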
+
+ andi J, N, 1
+ blez J, .L30
+ nop
+
+ dsll TEMP, K, BASE_SHIFT
+ dsubu B, B, TEMP # move B to the beginning address of Bj
+
+ dsubu C, C, LDC
+
+ move CO1, C
+
+ move AORIG, A
+
+ dsra I, M, 2
+ blez I, .L80
+ NOP
+
+.L31: # mr=4, nr=1
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 4 result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L35
+ NOP
+
+ .align 3
+
+.L32:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L32
+ NOP
+
+ .align 3
+
+.L35:
+ andi L, TEMP, 3
+ blez L, .L38
+ NOP
+ .align 3
+
+.L36:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L36
+ NOP
+
+ .align
+.L38:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ daddiu CO1, CO1, 4 * SIZE # advance C pointer
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L31
+ NOP
+
+ .align 3
+.L80:
+ andi I, M, 2
+ blez I, .L90
+ nop
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 2 result registers
+ MOV t21, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L85
+ NOP
+
+ .align 3
+
+.L82:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ daddiu L, L, -1
+ bgtz L, .L82
+ NOP
+
+ .align 3
+
+.L85:
+ andi L, TEMP, 3
+ blez L, .L88
+ NOP
+ .align 3
+
+.L86:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L86
+ NOP
+
+ .align
+.L88:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+ daddiu CO1, CO1, 2 * SIZE # advance C pointer
+
+ dsll TEMP, K, 1 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ .align 3
+.L90:
+ andi I, M, 1
+ blez I, .L39
+ nop
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear the result register
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L95
+ NOP
+
+ .align 3
+
+.L92:
+ LD a5, 1 * SIZE(AO)
+ LD b5, 1 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+
+ LD a3, 2 * SIZE(AO)
+ LD b3, 2 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+
+ LD a7, 3 * SIZE(AO)
+ LD b7, 3 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+
+ daddiu L, L, -1
+ bgtz L, .L92
+ NOP
+
+ .align 3
+
+.L95:
+ andi L, TEMP, 3
+ blez L, .L98
+ NOP
+ .align 3
+
+.L96:
+ MADD t11, t11, a1, b1
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 1 * SIZE # BO += 1nr
+
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L96
+ NOP
+
+ .align
+.L98:
+ daddiu TEMP, KK, -1 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, BASE_SHIFT # nr=1
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+
+ SUB t11, b1, t11
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+
+ ST t11, 0 * SIZE(AO) # update packed A
+
+ ST t11, 0 * SIZE(CO1) # write back
+
+ daddiu CO1, CO1, 1 * SIZE # advance C pointer
+
+ dsll TEMP, K, BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+.L39:
+ daddiu KK, KK, -1 # rectangular data length decreases by 1
+
+ .align 3
+.L30: # nr=2
+ andi J, N, 2
+ blez J, .L50
+ nop
+
+ dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr: move B to the beginning address of Bj
+ dsubu B, B, TEMP
+
+ dsll TEMP, LDC, 1 # C
+ dsubu C, C, TEMP
+
+ move CO1, C
+ daddu CO2, C, LDC
+
+ move AORIG, A
+
+ dsra I, M, 2
+ blez I, .L60
+ NOP
+
+.L51: # mr=4, nr=2
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 8 result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L55
+ NOP
+
+ .align 3
+
+.L52:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b3
+ MADD t21, t21, a2, b3
+ MADD t31, t31, a3, b3
+ MADD t41, t41, a4, b3
+
+ MADD t12, t12, a1, b4
+ MADD t22, t22, a2, b4
+ MADD t32, t32, a3, b4
+ MADD t42, t42, a4, b4
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a5, b7
+ MADD t21, t21, a6, b7
+ MADD t31, t31, a7, b7
+ MADD t41, t41, a8, b7
+
+ MADD t12, t12, a5, b8
+ MADD t22, t22, a6, b8
+ MADD t32, t32, a7, b8
+ MADD t42, t42, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L52
+ NOP
+
+ .align 3
+
+.L55:
+ andi L, TEMP, 3
+ blez L, .L58
+ NOP
+ .align 3
+
+.L56:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L56
+ NOP
+
+ .align
+.L58:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+ LD b8, 3 * SIZE(BO)
+ LD b1, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ MUL t32, b8, t32
+ MUL t42, b8, t42
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+ NMSUB t31, t31, b1, t32
+ NMSUB t41, t41, b1, t42
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ daddiu CO1, CO1, 4 * SIZE # advance C pointers
+ daddiu CO2, CO2, 4 * SIZE
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L51
+ NOP
+
+ .align 3
+.L60:
+ andi I, M, 2 # mr=2
+ blez I, .L70
+ nop
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 4 result registers
+ MOV t21, t11
+ MOV t12, t11
+ MOV t22, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L65
+ NOP
+
+ .align 3
+
+.L62:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t21, t21, a4, b3
+
+ MADD t12, t12, a3, b4
+ MADD t22, t22, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t21, t21, a8, b7
+
+ MADD t12, t12, a7, b8
+ MADD t22, t22, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L62
+ NOP
+
+ .align 3
+
+.L65:
+ andi L, TEMP, 3
+ blez L, .L68
+ NOP
+ .align 3
+
+.L66:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L66
+ NOP
+
+ .align
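+# .L68 below solves the 2x2 triangular system in reverse order, as RT
+# requires: the second column is scaled first by the diagonal entry at BO
+# offset 3, NMSUB then eliminates it from the first column via the
+# off-diagonal entry at offset 2, and finally the first column is scaled by
+# the diagonal entry at offset 0 (diagonals again presumed pre-inverted by
+# the packing).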
+.L68:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t12, b3, t12
+ SUB t22, b4, t22
+
+ LD b8, 3 * SIZE(BO)
+ LD b7, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ NMSUB t11, t11, b7, t12
+ NMSUB t21, t21, b7, t22
+
+ LD b6, 0 * SIZE(BO)
+ MUL t11, b6, t11
+ MUL t21, b6, t21
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE # advance C pointers
+ daddiu CO2, CO2, 2 * SIZE
+
+ dsll TEMP, K, 1 + BASE_SHIFT # mr=2
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ .align 3
+.L70:
+ andi I, M, 1 # mr=1
+ blez I, .L59
+ nop
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 2 result registers
+ MOV t12, t11
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L75
+ NOP
+
+ .align 3
+
+.L72:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 2 * SIZE(BO)
+ LD b6, 3 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ LD a3, 2 * SIZE(AO)
+
+ LD b3, 4 * SIZE(BO)
+ LD b4, 5 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+
+ LD a7, 3 * SIZE(AO)
+
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a3, b3
+ MADD t12, t12, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ MADD t11, t11, a7, b7
+ MADD t12, t12, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L72
+ NOP
+
+ .align 3
+
+.L75:
+ andi L, TEMP, 3
+ blez L, .L78
+ NOP
+ .align 3
+
+.L76:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 2 * SIZE # BO += 2nr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L76
+ NOP
+
+ .align
+.L78:
+ daddiu TEMP, KK, -2 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t12, b2, t12
+
+ LD b8, 3 * SIZE(BO)
+ LD b7, 2 * SIZE(BO)
+ MUL t12, b8, t12
+ NMSUB t11, t11, b7, t12
+
+ LD b6, 0 * SIZE(BO)
+ MUL t11, b6, t11
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t12, 1 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t12, 0 * SIZE(CO2)
+
+ daddiu CO1, CO1, 1 * SIZE # advance C pointers
+ daddiu CO2, CO2, 1 * SIZE
+
+ dsll TEMP, K, BASE_SHIFT # mr=1
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+.L59:
+ daddiu KK, KK, -2 # rectangular data length decreases by 2
+
+ .align 3
+.L50:
+ dsra J, N, 2 # J = NC/4
+ blez J, .L999
+ NOP
+
+.L10:
+ dsll TEMP, K, 2 + BASE_SHIFT
+ dsubu B, B, TEMP # move B to the beginning address of Bj
+
+ dsll TEMP, LDC, 2
+ dsubu C, C, TEMP # move C to the beginning address of Cj
+
+ daddiu J, J, -1
+
+ move CO1, C
+ daddu CO2, C, LDC
+ daddu CO3, CO2, LDC
+ daddu CO4, CO3, LDC
+
+ move AORIG, A # reset A
+
+ dsra I, M, 2 # I=MC/4
+ blez I, .L20
+ NOP
+
+ .align 3
+.L11:
+ dsll L, KK, 2 + BASE_SHIFT # mr=4
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 16 result registers
+ MOV t21, t11
+ MOV t31, t11
+ MOV t41, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t32, t11
+ MOV t42, t11
+ MOV t13, t11
+ MOV t23, t11
+ MOV t33, t11
+ MOV t43, t11
+ MOV t14, t11
+ MOV t24, t11
+ MOV t34, t11
+ MOV t44, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L15
+ NOP
+
+ .align 3
+
+.L12:
+ LD a5, 4 * SIZE(AO)
+ LD a6, 5 * SIZE(AO)
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # first
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8 # second
+
+ LD a5, 12 * SIZE(AO)
+ LD a6, 13 * SIZE(AO)
+ LD a7, 14 * SIZE(AO)
+ LD a8, 15 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4 # third
+
+ daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+ MADD t31, t31, a7, b5
+ MADD t41, t41, a8, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+ MADD t32, t32, a7, b6
+ MADD t42, t42, a8, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+ MADD t33, t33, a7, b7
+ MADD t43, t43, a8, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+ MADD t34, t34, a7, b8
+ MADD t44, t44, a8, b8 # fourth
+
+ daddiu L, L, -1
+ bgtz L, .L12
+ NOP
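+# The unroll-by-4 loop above is software-pipelined: the a/b operands of the
+# next sub-iteration are loaded while the MADDs of the current one are still
+# in flight, and AO/BO are bumped before the final MADD group so the loads
+# for the next loop iteration can issue early.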
+
+ .align 3
+
+.L15:
+ andi L, TEMP, 3
+ blez L, .L18
+ NOP
+ .align 3
+
+.L16:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+ MADD t31, t31, a3, b1
+ MADD t41, t41, a4, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+ MADD t32, t32, a3, b2
+ MADD t42, t42, a4, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+ MADD t33, t33, a3, b3
+ MADD t43, t43, a4, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+ MADD t34, t34, a3, b4
+ MADD t44, t44, a4, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 4mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L16
+ NOP
+
+ .align
+.L18:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+ SUB t31, b3, t31
+ SUB t41, b4, t41
+
+ LD b5, 4 * SIZE(AO)
+ LD b6, 5 * SIZE(AO)
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+ SUB t32, b7, t32
+ SUB t42, b8, t42
+
+ LD b1, 8 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 10 * SIZE(AO)
+ LD b4, 11 * SIZE(AO)
+
+ SUB t13, b1, t13
+ SUB t23, b2, t23
+ SUB t33, b3, t33
+ SUB t43, b4, t43
+
+ LD b5, 12 * SIZE(AO)
+ LD b6, 13 * SIZE(AO)
+ LD b7, 14 * SIZE(AO)
+ LD b8, 15 * SIZE(AO)
+
+ SUB t14, b5, t14
+ SUB t24, b6, t24
+ SUB t34, b7, t34
+ SUB t44, b8, t44
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ MUL t24, b1, t24
+ MUL t34, b1, t34
+ MUL t44, b1, t44
+ NMSUB t13, t13, b2, t14
+ NMSUB t23, t23, b2, t24
+ NMSUB t33, t33, b2, t34
+ NMSUB t43, t43, b2, t44
+ NMSUB t12, t12, b3, t14
+ NMSUB t22, t22, b3, t24
+ NMSUB t32, t32, b3, t34
+ NMSUB t42, t42, b3, t44
+ NMSUB t11, t11, b4, t14
+ NMSUB t21, t21, b4, t24
+ NMSUB t31, t31, b4, t34
+ NMSUB t41, t41, b4, t44
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ MUL t23, b5, t23
+ MUL t33, b5, t33
+ MUL t43, b5, t43
+ NMSUB t12, t12, b6, t13
+ NMSUB t22, t22, b6, t23
+ NMSUB t32, t32, b6, t33
+ NMSUB t42, t42, b6, t43
+ NMSUB t11, t11, b7, t13
+ NMSUB t21, t21, b7, t23
+ NMSUB t31, t31, b7, t33
+ NMSUB t41, t41, b7, t43
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ MUL t32, b8, t32
+ MUL t42, b8, t42
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+ NMSUB t31, t31, b1, t32
+ NMSUB t41, t41, b1, t42
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+ MUL t31, b2, t31
+ MUL t41, b2, t41
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+ ST t31, 2 * SIZE(AO)
+ ST t41, 3 * SIZE(AO)
+
+ ST t12, 4 * SIZE(AO)
+ ST t22, 5 * SIZE(AO)
+ ST t32, 6 * SIZE(AO)
+ ST t42, 7 * SIZE(AO)
+
+ ST t13, 8 * SIZE(AO)
+ ST t23, 9 * SIZE(AO)
+ ST t33, 10 * SIZE(AO)
+ ST t43, 11 * SIZE(AO)
+
+ ST t14, 12 * SIZE(AO)
+ ST t24, 13 * SIZE(AO)
+ ST t34, 14 * SIZE(AO)
+ ST t44, 15 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+ ST t31, 2 * SIZE(CO1)
+ ST t41, 3 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+ ST t32, 2 * SIZE(CO2)
+ ST t42, 3 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+ ST t33, 2 * SIZE(CO3)
+ ST t43, 3 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+ ST t34, 2 * SIZE(CO4)
+ ST t44, 3 * SIZE(CO4)
+
+ daddiu CO1, CO1, 4 * SIZE # advance C pointers
+ daddiu CO2, CO2, 4 * SIZE
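+# For nr=4, RT back-substitutes through the 4x4 triangular factor from the
+# last column to the first: diagonal entries are read at BO offsets 15, 10,
+# 5 and 0, and after each column is scaled the NMSUBs propagate it into the
+# still-unsolved columns through the off-diagonal entries (14, 13, 12, then
+# 9, 8, then 4).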
+ daddiu CO3, CO3, 4 * SIZE
+ daddiu CO4, CO4, 4 * SIZE
+
+ dsll TEMP, K, 2 + BASE_SHIFT
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ daddiu I, I, -1
+ bgtz I, .L11
+ NOP
+
+ .align 3
+.L20:
+ andi I, M, 2 # mr=2
+ blez I, .L40
+ NOP
+
+ dsll L, KK, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 8 result registers
+ MOV t21, t11
+ MOV t12, t11
+ MOV t22, t11
+ MOV t13, t11
+ MOV t23, t11
+ MOV t14, t11
+ MOV t24, t11
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L25
+ NOP
+
+ .align 3
+
+.L22:
+ LD a5, 2 * SIZE(AO)
+ LD a6, 3 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t21, t21, a6, b5
+
+ MADD t12, t12, a5, b6
+ MADD t22, t22, a6, b6
+
+ MADD t13, t13, a5, b7
+ MADD t23, t23, a6, b7
+
+ MADD t14, t14, a5, b8
+ MADD t24, t24, a6, b8
+
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t21, t21, a4, b1
+
+ MADD t12, t12, a3, b2
+ MADD t22, t22, a4, b2
+
+ MADD t13, t13, a3, b3
+ MADD t23, t23, a4, b3
+
+ MADD t14, t14, a3, b4
+ MADD t24, t24, a4, b4
+
+ daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t21, t21, a8, b5
+
+ MADD t12, t12, a7, b6
+ MADD t22, t22, a8, b6
+
+ MADD t13, t13, a7, b7
+ MADD t23, t23, a8, b7
+
+ MADD t14, t14, a7, b8
+ MADD t24, t24, a8, b8
+
+ daddiu L, L, -1
+ bgtz L, .L22
+ NOP
+
+ .align 3
+
+.L25:
+ andi L, TEMP, 3
+ blez L, .L28
+ NOP
+ .align 3
+
+.L26:
+ MADD t11, t11, a1, b1
+ MADD t21, t21, a2, b1
+
+ MADD t12, t12, a1, b2
+ MADD t22, t22, a2, b2
+
+ MADD t13, t13, a1, b3
+ MADD t23, t23, a2, b3
+
+ MADD t14, t14, a1, b4
+ MADD t24, t24, a2, b4
+
+ daddiu AO, AO, 2 * SIZE # AO += 2mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L26
+ NOP
+
+ .align
+.L28:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, 1 + BASE_SHIFT # mr=2
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b2, 1 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t21, b2, t21
+
+ LD b5, 2 * SIZE(AO)
+ LD b6, 3 * SIZE(AO)
+
+ SUB t12, b5, t12
+ SUB t22, b6, t22
+
+ LD b3, 4 * SIZE(AO)
+ LD b4, 5 * SIZE(AO)
+
+ SUB t13, b3, t13
+ SUB t23, b4, t23
+
+ LD b7, 6 * SIZE(AO)
+ LD b8, 7 * SIZE(AO)
+
+ SUB t14, b7, t14
+ SUB t24, b8, t24
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ MUL t24, b1, t24
+ NMSUB t13, t13, b2, t14
+ NMSUB t23, t23, b2, t24
+ NMSUB t12, t12, b3, t14
+ NMSUB t22, t22, b3, t24
+ NMSUB t11, t11, b4, t14
+ NMSUB t21, t21, b4, t24
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ MUL t23, b5, t23
+ NMSUB t12, t12, b6, t13
+ NMSUB t22, t22, b6, t23
+ NMSUB t11, t11, b7, t13
+ NMSUB t21, t21, b7, t23
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ MUL t22, b8, t22
+ NMSUB t11, t11, b1, t12
+ NMSUB t21, t21, b1, t22
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+ MUL t21, b2, t21
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t21, 1 * SIZE(AO)
+
+ ST t12, 2 * SIZE(AO)
+ ST t22, 3 * SIZE(AO)
+
+ ST t13, 4 * SIZE(AO)
+ ST t23, 5 * SIZE(AO)
+
+ ST t14, 6 * SIZE(AO)
+ ST t24, 7 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t21, 1 * SIZE(CO1)
+
+ ST t12, 0 * SIZE(CO2)
+ ST t22, 1 * SIZE(CO2)
+
+ ST t13, 0 * SIZE(CO3)
+ ST t23, 1 * SIZE(CO3)
+
+ ST t14, 0 * SIZE(CO4)
+ ST t24, 1 * SIZE(CO4)
+
+ daddiu CO1, CO1, 2 * SIZE # advance C pointers
+ daddiu CO2, CO2, 2 * SIZE
+ daddiu CO3, CO3, 2 * SIZE
+ daddiu CO4, CO4, 2 * SIZE
+
+ dsll TEMP, K, 1 + BASE_SHIFT # mr=2
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+ .align 3
+.L40:
+ andi I, M, 1
+ blez I, .L29
+ NOP
+
+ dsll L, KK, BASE_SHIFT # mr=1
+ dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the rectangular data part; also resets BO
+ dsubu TEMP, K, KK # TEMP = the length of the rectangular data part
+
+ MTC $0, t11 # clear 4 result registers
+ MOV t12, t11
+ MOV t13, t11
+ MOV t14, t11
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ dsra L, TEMP, 2 # L=(KC-offset)/4
+ blez L, .L45
+ NOP
+
+ .align 3
+
+.L42:
+ LD a5, 1 * SIZE(AO)
+
+ LD b5, 4 * SIZE(BO)
+ LD b6, 5 * SIZE(BO)
+ LD b7, 6 * SIZE(BO)
+ LD b8, 7 * SIZE(BO)
+
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ LD a3, 2 * SIZE(AO)
+
+ LD b1, 8 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 10 * SIZE(BO)
+ LD b4, 11 * SIZE(BO)
+
+ MADD t11, t11, a5, b5
+ MADD t12, t12, a5, b6
+ MADD t13, t13, a5, b7
+ MADD t14, t14, a5, b8
+
+ LD a7, 3 * SIZE(AO)
+
+ LD b5, 12 * SIZE(BO)
+ LD b6, 13 * SIZE(BO)
+ LD b7, 14 * SIZE(BO)
+ LD b8, 15 * SIZE(BO)
+
+ MADD t11, t11, a3, b1
+ MADD t12, t12, a3, b2
+ MADD t13, t13, a3, b3
+ MADD t14, t14, a3, b4
+
+ daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
+ daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ MADD t11, t11, a7, b5
+ MADD t12, t12, a7, b6
+ MADD t13, t13, a7, b7
+ MADD t14, t14, a7, b8
+
+ daddiu L, L, -1
+ bgtz L, .L42
+ NOP
+
+ .align 3
+
+.L45:
+ andi L, TEMP, 3
+ blez L, .L48
+ NOP
+ .align 3
+
+.L46:
+ MADD t11, t11, a1, b1
+ MADD t12, t12, a1, b2
+ MADD t13, t13, a1, b3
+ MADD t14, t14, a1, b4
+
+ daddiu AO, AO, 1 * SIZE # AO += 1mr
+ daddiu BO, BO, 4 * SIZE # BO += 4nr
+
+ LD a1, 0 * SIZE(AO)
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ daddiu L, L, -1
+ bgtz L, .L46
+ NOP
+
+ .align
+.L48:
+ daddiu TEMP, KK, -4 # deal with the triangular data part
+ dsll L, TEMP, BASE_SHIFT # mr=1
+ dsll TEMP, TEMP, 2 + BASE_SHIFT
+ daddu AO, AORIG, L
+ daddu BO, B, TEMP # BO points to the triangular data part
+
+ LD b1, 0 * SIZE(AO) # fix up the results
+ LD b5, 1 * SIZE(AO)
+ LD b3, 2 * SIZE(AO)
+ LD b7, 3 * SIZE(AO)
+
+ SUB t11, b1, t11
+ SUB t12, b5, t12
+ SUB t13, b3, t13
+ SUB t14, b7, t14
+
+ LD b1, 15 * SIZE(BO)
+ LD b2, 14 * SIZE(BO)
+ LD b3, 13 * SIZE(BO)
+ LD b4, 12 * SIZE(BO)
+ MUL t14, b1, t14
+ NMSUB t13, t13, b2, t14
+ NMSUB t12, t12, b3, t14
+ NMSUB t11, t11, b4, t14
+
+ LD b5, 10 * SIZE(BO)
+ LD b6, 9 * SIZE(BO)
+ LD b7, 8 * SIZE(BO)
+ MUL t13, b5, t13
+ NMSUB t12, t12, b6, t13
+ NMSUB t11, t11, b7, t13
+
+ LD b8, 5 * SIZE(BO)
+ LD b1, 4 * SIZE(BO)
+ MUL t12, b8, t12
+ NMSUB t11, t11, b1, t12
+
+ LD b2, 0 * SIZE(BO)
+ MUL t11, b2, t11
+
+ ST t11, 0 * SIZE(AO) # update packed A
+ ST t12, 1 * SIZE(AO)
+ ST t13, 2 * SIZE(AO)
+ ST t14, 3 * SIZE(AO)
+
+ ST t11, 0 * SIZE(CO1) # write back
+ ST t12, 0 * SIZE(CO2)
+ ST t13, 0 * SIZE(CO3)
+ ST t14, 0 * SIZE(CO4)
+
+ daddiu CO1, CO1, 1 * SIZE # advance C pointers
+ daddiu CO2, CO2, 1 * SIZE
+ daddiu CO3, CO3, 1 * SIZE
+ daddiu CO4, CO4, 1 * SIZE
+
+ dsll TEMP, K, BASE_SHIFT # mr=1
+ daddu AORIG, AORIG, TEMP # move to the next panel Ai
+
+.L29:
+ daddiu KK, KK, -4 # rectangular data length decreases by 4
+ bgtz J, .L10
+ NOP
+
+ .align 3
+
+.L999:
+ LDARG $16, 0($sp)
+ LDARG $17, 8($sp)
+ LDARG $18, 16($sp)
+ LDARG $19, 24($sp)
+ LDARG $20, 32($sp)
+ LDARG $21, 40($sp)
+ ldc1 $f24, 48($sp)
+ ldc1 $f25, 56($sp)
+ ldc1 $f26, 64($sp)
+ ldc1 $f27, 72($sp)
+ ldc1 $f28, 80($sp)
+
+ LDARG $22, 88($sp)
+ LDARG $23, 96($sp)
+ LDARG $24, 104($sp)
+ LDARG $25, 112($sp)
+
+#ifndef __64BIT__
+ ldc1 $f20,112($sp)
+ ldc1 $f21,120($sp)
+ ldc1 $f22,128($sp)
+ ldc1 $f23,136($sp)
+#endif
+
+ j $31
+ daddiu $sp, $sp, 144
+
+ EPILOGUE
diff --git a/param.h b/param.h
index 8fcd19358..603caab46 100644
--- a/param.h
+++ b/param.h
@@ -1480,27 +1480,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+
 #define CGEMM_DEFAULT_UNROLL_M 1
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 1
 #define ZGEMM_DEFAULT_UNROLL_N 4
 
-#define SGEMM_DEFAULT_P 108
-#define DGEMM_DEFAULT_P 112
+#define SGEMM_DEFAULT_P 32
+#define DGEMM_DEFAULT_P 32
 #define CGEMM_DEFAULT_P 108
 #define ZGEMM_DEFAULT_P 112
 
-#define SGEMM_DEFAULT_Q 288
-#define DGEMM_DEFAULT_Q 144
+#define SGEMM_DEFAULT_Q 116
+#define DGEMM_DEFAULT_Q 116
 #define CGEMM_DEFAULT_Q 144
 #define ZGEMM_DEFAULT_Q 72
 
-#define SGEMM_DEFAULT_R 2000
-#define DGEMM_DEFAULT_R 2000
+#define SGEMM_DEFAULT_R 1000
+#define DGEMM_DEFAULT_R 1000
 #define CGEMM_DEFAULT_R 2000
 #define ZGEMM_DEFAULT_R 2000