diff --git a/README b/README
index 79ab48d8b..c8c2c2c55 100644
--- a/README
+++ b/README
@@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
 9.Known Issues
 * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32.
+* On Loongson 3A, make test may fail with a pthread_create error (error code EAGAIN), although the same test case passes when run directly from the shell. This does not appear to be a bug in OpenBLAS itself.
 
 10. Specification of Git Branches
 We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
diff --git a/common_macro.h b/common_macro.h
index bcaa9f38b..0c34ecb01 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -2127,7 +2127,9 @@
 #endif
 
 #ifndef ASSEMBLER
-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
+extern BLASLONG gemm_offset_a;
+extern BLASLONG gemm_offset_b;
 extern BLASLONG sgemm_p;
 extern BLASLONG sgemm_q;
 extern BLASLONG sgemm_r;
diff --git a/common_mips64.h b/common_mips64.h
index acea79011..35d8265bc 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define CMPEQ c.eq.d
 #define CMPLE c.le.d
 #define CMPLT c.lt.d
+#define NEG   neg.d
 #else
 #define LD lwc1
 #define ST swc1
@@ -170,6 +171,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define CMPEQ c.eq.s
 #define CMPLE c.le.s
 #define CMPLT c.lt.s
+#define PLU   plu.ps
+#define PLL   pll.ps
+#define PUU   puu.ps
+#define PUL   pul.ps
+#define MADPS madd.ps
+#define CVTU  cvt.s.pu
+#define CVTL  cvt.s.pl
+#define NEG   neg.s
 #endif
 
 #if defined(__64BIT__) && defined(USE64BITINT)
@@ -218,7 +227,7 @@ REALNAME: ;\
 
 #define SEEK_ADDRESS
 
-#define BUFFER_SIZE ( 8 << 20)
+#define BUFFER_SIZE (32 << 20)
 
 #if defined(LOONGSON3A)
 #define PAGESIZE (16UL << 10)
diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c
index ba54612eb..f9007f831 100644
--- a/driver/level3/gemm_thread_n.c
+++ b/driver/level3/gemm_thread_n.c
@@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     queue[num_cpu].args    = arg;
     queue[num_cpu].range_m = range_m;
     queue[num_cpu].range_n = &range[num_cpu];
-    queue[num_cpu].sa      = NULL;
+#if defined(LOONGSON3A)
+    queue[num_cpu].sa      = sa + GEMM_OFFSET_A1 * num_cpu;
+    queue[num_cpu].sb      = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
+#else
+    queue[num_cpu].sa      = NULL;
     queue[num_cpu].sb      = NULL;
+#endif
     queue[num_cpu].next    = &queue[num_cpu + 1];
     num_cpu ++;
   }
 
   if (num_cpu) {
+#if defined(LOONGSON3A)
     queue[0].sa = sa;
-    queue[0].sb = sb;
-
+    queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
+#else
+    queue[0].sa = sa;
+    queue[0].sb = sb;
+#endif
     queue[num_cpu - 1].next = NULL;
 
     exec_blas(num_cpu,
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index c0f77c4c9..66067a05c 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
 /* Initializing routine */
 int blas_thread_init(void){
   BLASLONG i;
+  int ret;
 #ifdef NEED_STACKATTR
   pthread_attr_t attr;
 #endif
@@ -545,12 +546,16 @@ int blas_thread_init(void){
     pthread_cond_init (&thread_status[i].wakeup, NULL);
 
 #ifdef NEED_STACKATTR
-    pthread_create(&blas_threads[i], &attr,
+    ret=pthread_create(&blas_threads[i], &attr,
 		   (void *)&blas_thread_server, (void *)i);
 #else
-    pthread_create(&blas_threads[i], NULL,
+    ret=pthread_create(&blas_threads[i], NULL,
 		   (void *)&blas_thread_server, (void *)i);
 #endif
+    if(ret!=0){
+      fprintf(STDERR,"OpenBLAS: pthread_create error in blas_thread_init function. Error code: %d\n",ret);
+      exit(1);
+    }
   }
 
 #ifdef MONITOR
@@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
 
   blas_cpu_number = num_threads;
 
+#if defined(ARCH_MIPS64)
+  //set parameters for the new number of threads.
+  blas_set_parameter();
+#endif
+
 }
 
 void openblas_set_num_threads(int num_threads) {
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 4fd4cd440..c45856fd9 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
 
 	omp_set_num_threads(blas_cpu_number);
 
+#if defined(ARCH_MIPS64)
+	//set parameters for the new number of threads.
+	blas_set_parameter();
+#endif
+
 }
 
 void openblas_set_num_threads(int num_threads) {
diff --git a/driver/others/memory.c b/driver/others/memory.c
index dd8334477..ac9c87850 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
 
   if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
 #endif
 
-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
 #ifndef DYNAMIC_ARCH
   blas_set_parameter();
 #endif
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 9e72fd24f..fc7f0447e 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -45,8 +45,26 @@ int get_L2_size(void);
 #define DEFAULT_GEMM_P 128
 #define DEFAULT_GEMM_Q 128
 #define DEFAULT_GEMM_R 128
+#define DEFAULT_GEMM_OFFSET_A 0
+#define DEFAULT_GEMM_OFFSET_B 0
 
 /* Global Parameter */
+/* Note: if GEMM_OFFSET_A was never #defined, both sides of the #if
+   below are undefined identifiers, evaluate to 0 in preprocessor
+   arithmetic, and the default is used; otherwise the configured
+   value wins. The same idiom is used for every parameter here. */
+#if GEMM_OFFSET_A == gemm_offset_a
+BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
+#else
+BLASLONG gemm_offset_a = GEMM_OFFSET_A;
+#endif
+
+#if GEMM_OFFSET_B == gemm_offset_b
+BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
+#else
+BLASLONG gemm_offset_b = GEMM_OFFSET_B;
+#endif
+
 #if SGEMM_P == sgemm_p
 BLASLONG sgemm_p = DEFAULT_GEMM_P;
 #else
@@ -666,3 +680,21 @@ void blas_set_parameter(void){
 
 #endif
 #endif
+
+#if defined(ARCH_MIPS64)
+void blas_set_parameter(void){
+#if defined(LOONGSON3A)
+#ifdef SMP
+  if(blas_num_threads == 1){
+#endif
+    //single thread
+    dgemm_r = 1024;
+#ifdef SMP
+  }else{
+    //multi thread
+    dgemm_r = 200;
+  }
+#endif
+#endif
+}
+#endif
diff --git a/interface/symm.c b/interface/symm.c
index a0d52c49d..b447f13e8 100644
--- a/interface/symm.c
+++ b/interface/symm.c
@@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
   FLOAT *sa, *sb;
 
 #ifdef SMP
+#ifndef COMPLEX
 #ifdef XDOUBLE
   int mode  =  BLAS_XDOUBLE | BLAS_REAL;
 #elif defined(DOUBLE)
@@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
 #else
   int mode  =  BLAS_SINGLE  | BLAS_REAL;
 #endif
+#else
+#ifdef XDOUBLE
+  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
+#elif defined(DOUBLE)
+  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
+#else
+  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
+#endif
+#endif
 #endif
 
 #if defined(SMP) && !defined(NO_AFFINITY)
@@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
   FLOAT *sa, *sb;
 
 #ifdef SMP
+#ifndef COMPLEX
 #ifdef XDOUBLE
   int mode  =  BLAS_XDOUBLE | BLAS_REAL;
 #elif defined(DOUBLE)
@@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
 #else
   int mode  =  BLAS_SINGLE  | BLAS_REAL;
 #endif
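Review note: interface/symm.c is built once per precision and domain (ssymm/dsymm and csymm/zsymm all come from this file), but the SMP mode word handed to the threading layer previously carried BLAS_REAL even when COMPLEX was defined. This hunk and its continuation just below add the missing complex branch. A minimal C sketch of the selection, assuming the BLAS_* mode flags from common.h (MODE_PRECISION and MODE_DOMAIN are illustrative names, not OpenBLAS identifiers):

    #ifdef XDOUBLE
    #define MODE_PRECISION BLAS_XDOUBLE
    #elif defined(DOUBLE)
    #define MODE_PRECISION BLAS_DOUBLE
    #else
    #define MODE_PRECISION BLAS_SINGLE
    #endif

    #ifdef COMPLEX
    #define MODE_DOMAIN BLAS_COMPLEX   /* the branch this patch adds */
    #else
    #define MODE_DOMAIN BLAS_REAL
    #endif

    int mode = MODE_PRECISION | MODE_DOMAIN;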
+#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index ebb447b11..6afb2cf13 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT DTRSMKERNEL_RT = trsm_kernel_RT.S endif +ifndef CTRSMKERNEL_LN CTRSMKERNEL_LN = ztrsm_kernel_LT.S -CTRSMKERNEL_LT = ztrsm_kernel_LT.S -CTRSMKERNEL_RN = ztrsm_kernel_LT.S -CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef ZTRSMKERNEL_LN ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index e72ac142e..fc247e473 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,18 +1,48 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S -SGEMMKERNEL = sgemm_kernel_loongson3a.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c + + +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = gemm_kernel_loongson3a.S +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c 
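Review note: the ifndef guards that kernel/mips64/KERNEL now places around every C/Z TRSM assignment follow make's standard overridable-default pattern:

    ifndef ZTRSMKERNEL_RT
    ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
    endif

The per-core file KERNEL.LOONGSON3A (the list continuing around this note) appears to be read before the shared KERNEL list, which is what makes the guards effective: its assignments win, and the shared assembly kernels remain only as fallbacks. Here the complex TRSM variants are redirected to the generic C implementations.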
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + 
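Review note: two idioms recur throughout this kernel. First, leading dimensions arrive counted in complex elements and are turned into byte strides by a shift: for single-precision complex one element is two floats (8 bytes), so ZBASE_SHIFT is 3. Second, MIPS branch delay slots: the instruction written after each branch executes whether or not the branch is taken, so the ST of ALPHA_I immediately below runs even when the blez branch to .L20 is taken; this is why stores and counter updates are tucked behind blez/bgtz all through the file. A C restatement of the stride computation (ldc_bytes is an illustrative name, not an OpenBLAS function):

    #include <stddef.h>

    #define ZBASE_SHIFT 3  /* log2(2 * sizeof(float)) for single complex */

    /* Equivalent of "dsll LDC, LDC, ZBASE_SHIFT" above: convert a
       column stride counted in elements into one counted in bytes
       (the "# LDC*SIZE*COMPSIZE" comment). */
    static inline size_t ldc_bytes(size_t ldc_elements) {
        return ldc_elements << ZBASE_SHIFT;
    }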
blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 
12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, 
ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # 
bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move 
CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 
c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# 
gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S new file mode 100644 index 000000000..e78ad209f --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -0,0 +1,4026 @@ +##define REALNAME gemm +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define STACKSIZE 192 +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +##### Parameter registers #### +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 + 
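Review note: this paired-single kernel wants Loongson 3A's 128-bit quad-word FP loads and stores (gslqc1/gssqc1, two consecutive FP registers per access), but the assemblers of the day did not know those mnemonics, so the gsLQC1/gsSQC1 macros at the top of the file emit the raw instruction words via .word. A C model of the same bit packing, transcribed field for field from the macro (gs_lqc1_word is an illustrative name; like the macro, it performs no operand range checking):

    #include <stdint.h>

    /* major opcode 0x32 = gslqc1: load FP registers fq and ft from
       memory at a base register plus an immediate offset field. */
    static uint32_t gs_lqc1_word(uint32_t base, uint32_t fq,
                                 uint32_t ft, uint32_t offset) {
        return (0x32u << 26) | (base << 21) | (ft << 16)
             | (0x1u << 15) | (offset << 6) | (0x1u << 5) | fq;
    }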
+#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 $f23 +#define C13 $f24 +#define C14 $f25 +#define C23 $f26 +#define C24 $f27 +#define C33 $f28 +#define C34 $f29 +#define C43 $f30 +#define C44 $f31 + +#define I $2 +#define J $3 +#define L $7 + +#### Alpha register #### +#define ALPHA $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + +#define R12 12 +#define R13 13 + +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp,$sp,-STACKSIZE + + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + +#if defined(TRMMKERNEL) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + + LDARG OFFSET, STACKSIZE+8($sp) +#endif + +#ifndef __64BIT__ + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) +#endif + + .align 4 +.L2: + dsra J, N, 1 # NR=2 + ST $f15, 152($sp) + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE + blez J, .L1 + ST $f16, 160($sp) + +.L24: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 2 # MR=8 + move AO, A # Reset A + + dsll PREA, K, 1 + ZBASE_SHIFT + move CO1, C + + daddu CO2, C, LDC + daddu PREA, AO, PREA + + blez I, .L22 + daddu C, CO2, LDC + + .align 4 +.L241: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, ZBASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MOV C31, C11 + MOV C32, C11 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C41, C11 + MOV C42, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 + + PLU B3, B1, B1 + PLU B4, B2, B2 + daddu PREB, BO, PREB + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L242 + NOP + +#else + + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, 
ZBASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MOV C31, C11 + MOV C32, C11 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C41, C11 + MOV C42, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C23, C11 + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 + daddu PREB, BO, PREB + + PLU B3, B1, B1 + PLU B4, B2, B2 + + FETCH $0, 8 * SIZE(CO1) + blez L, .L242 + FETCH $0, 8 * SIZE(CO2) +#endif + +.L2410: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + FETCH $0, 0 * SIZE(PREB) + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + FETCH $0, 0 * SIZE(PREA) + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddu PREB, PREB, 8 * SIZE + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + FETCH $0, 8 * SIZE(PREA) + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 6) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + FETCH $0, 16 * SIZE(PREA) + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + FETCH $0, 24 * SIZE(PREA) + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddu PREA, PREA, 32 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + bgtz L, .L2410 + MADPS C44, C44, A8, B8 + + + .align 4 +.L242: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L247 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 4 * 4 * SIZE + + MADPS 
C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + .align 4 +.L247: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L240 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu BO, BO, 1 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + + .align 4 +.L240: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, 
A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + SUB C34, C34, B7 + SUB C44, C44, B8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, 
B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + NEG C13, C13 + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + NEG C14, C14 + NEG C24, C24 + NEG C34, C34 + NEG C44, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + ST B1, 0 * SIZE(CO1) + + MADD C23, C23, C22, A1 + ST B3, 2 * SIZE(CO1) + + MADD C33, C33, C32, A1 + ST B5, 4 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B7, 6 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B2, 1 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B4, 3 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B6, 5 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B8, 7 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU 
B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B3, 2 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B5, 4 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B7, 6 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B2, 1 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B4, 3 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B6, 5 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B8, 7 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B4, 3 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B6, 5 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST B8, 7 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 
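Review note: the conjugation cases (NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT, RR/RC/CR/CC) all accumulate the same four partial products per complex multiply (ac, bd, ad, bc) and differ only in the ADD/SUB pairing used in these write-back blocks, plus the extra NEG pass when both operands are conjugated. A plain C reference for a single product term, with cmul_variant as an illustrative name:

    #include <complex.h>

    /* (a or conj(a)) * (b or conj(b)); expanding gives
       real = a.r*b.r -/+ a.i*b.i and imag = a.r*b.i +/- a.i*b.r,
       which is exactly the sign table the four cases implement. */
    static float complex cmul_variant(float complex a, float complex b,
                                      int conj_a, int conj_b) {
        if (conj_a) a = conjf(a);
        if (conj_b) b = conjf(b);
        return a * b;
    }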
+# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + + SUB C34, C34, B7 + SUB C44, C44, B8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B4, 3 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B6, 5 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST B8, 7 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + NEG C13, C13 + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + NEG C14, C14 + NEG C24, C24 + NEG C34, C34 + NEG C44, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B3, 2 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B5, 4 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B7, 6 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B2, 1 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B4, 3 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B6, 5 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B8, 7 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 2 + ZBASE_SHIFT + dsll 
TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + +#endif + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L241 + daddiu CO2, CO2, 8 * SIZE + + .align 4 +.L22: + andi I, M, 2 # MR=4 + blez I, .L21 + NOP + + .align 4 +.L221: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # MR=2 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L222 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L222 + PLU B4, B2, B2 +#endif + +.L2210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A5, B1 + MADPS C21, C21, A6, B1 + + MADPS C12, C12, A5, B2 + MADPS C22, C22, A6, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A7, B5 + MADPS C21, C21, A8, B5 + + MADPS C12, C12, A7, B6 + MADPS C22, C22, A8, B6 + + MADPS C13, C13, A7, B7 + MADPS C23, C23, A8, B7 + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L2210 + PLU B4, B2, B2 + + + .align 4 +.L222: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L227 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + gsLQC1(R13, F9, F8, 0) # A1 A2 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + gsLQC1(R12, F1, F0, 
0) # A1 A2 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L227: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L220 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + .align 4 +.L220: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + SUB C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + LD B1, 0 * 
SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + 
SUB C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L211: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L212 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L212 + PLU B4, B2, B2 +#endif + +.L2110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A3, B1 + MADPS C12, C12, A3, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L2110 + PLU B4, B2, B2 + + + .align 4 +.L212: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L217 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + gsLQC1(R12, F1, F0, 0) # A5 A6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L217: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L210 + NOP + + MADPS C11, C11, A1, B1 + daddiu BO, BO, 4 * SIZE + MADPS C12, C12, A1, B2 + daddiu AO, AO, 2 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L210: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, 
C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + NEG C14, C14 + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, 
C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + NEG C14, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + +#endif + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + + .align 4 +.L20: + daddiu J, J, -1 + move B, BO + +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L24 + NOP + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + +.L14: + dsra I, M, 2 # MR=8 + move AO, A # Reset A + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move CO1, C + blez I, .L12 + daddu C, CO1, LDC + + .align 4 +.L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 # define Mr=4 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L142 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + blez L, .L142 + PLU B4, B2, B2 +#endif + +.L1410: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, A8, B2 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + 
+ MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A1, B5 + MADPS C21, C21, A2, B5 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B5 + MADPS C41, C41, A4, B5 + + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A1, B7 + MADPS C23, C23, A2, B7 + + MADPS C33, C33, A3, B7 + MADPS C43, C43, A4, B7 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B6 + MADPS C21, C21, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B6 + MADPS C41, C41, A8, B6 + + MADPS C13, C13, A5, B8 + MADPS C23, C23, A6, B8 + + MADPS C33, C33, A7, B8 + MADPS C43, C43, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1410 + PLU B4, B2, B2 + + + .align 4 +.L142: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L147 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F8, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, A8, B2 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + + MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + PLU B3, B1, B1 + + + .align 4 +.L147: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L140 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + + .align 4 +.L140: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # 
ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # AC'+'BD + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I + + ADD C13, A5, C13 # AD'+'CB + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # AD'+'CB + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = ALPHA_R + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + 
CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # AC'+'BD + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I + + ADD C13, A5, C13 # AD'+'CB + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # AD'+'CB + NEG C23, C23 
+ NEG C33, C33 + NEG C43, C43 + + MUL B1, C11, A1 # A1 = ALPHA_R + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + +#endif + bgtz I, .L141 + daddiu CO1, CO1, 8 * SIZE + + .align 4 +.L12: + andi I, M, 2 # MR=4 + blez I, .L11 + NOP + + .align 4 +.L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L122 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L122 + PLU B4, B2, B2 +#endif + +.L1210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A7, B6 + MADPS C21, C21, A8, B6 + + MADPS C13, C13, A7, B8 + MADPS C23, C23, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1210 + PLU B4, B2, B2 + + + .align 4 +.L122: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L127 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + PLU B7, B5, B5 + daddiu BO, BO, 1 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + .align 4 +.L127: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L120 + NOP + + MADPS C11, C11, A1, B1 + MADPS 
C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + .align 4 +.L120: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # 
load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L112 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + 
+ gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L112 + PLU B4, B2, B2 +#endif + +.L1110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + MADPS C11, C11, A3, B5 + MADPS C13, C13, A3, B7 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L1110 + PLU B4, B2, B2 + + + .align 4 +.L112: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L117 + NOP + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + + .align 4 +.L117: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L110 + NOP + + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 2 * SIZE + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + + + .align 4 +.L110: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + NEG C13, C13 + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, 
C11, A4	# A1 = alpha_r
+	MUL	B2, C13, A4
+	NMSUB	B1, B1, C13, A2	# A2 = alpha_i
+	MADD	B2, B2, C11, A2
+
+	ST	B1, 0 * SIZE(CO1)
+	ST	B2, 1 * SIZE(CO1)
+#endif
+
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+	/* (a + bi) * (c - di) */
+	ADD	C11, A1, C11	# ac'+'bd
+	SUB	C13, A3, C13	# ad'+'cb
+	LD	A4, 152($sp)	# load alpha_r
+	LD	A2, 160($sp)	# load alpha_i
+
+	MUL	B1, C11, A4	# A1 = alpha_r
+	MUL	B2, C13, A4
+	NMSUB	B1, B1, C13, A2	# A2 = alpha_i
+	MADD	B2, B2, C11, A2
+
+	ST	B1, 0 * SIZE(CO1)
+	ST	B2, 1 * SIZE(CO1)
+#endif
+
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+	/* (a - bi) * (c + di) */
+	ADD	C11, A1, C11	# ac'+'bd
+	SUB	C13, C13, A3	# ad'+'cb
+	LD	A4, 152($sp)	# load alpha_r
+	LD	A2, 160($sp)	# load alpha_i
+
+	MUL	B1, C11, A4	# A1 = alpha_r
+	MUL	B2, C13, A4
+	NMSUB	B1, B1, C13, A2	# A2 = alpha_i
+	MADD	B2, B2, C11, A2
+
+	ST	B1, 0 * SIZE(CO1)
+	ST	B2, 1 * SIZE(CO1)
+#endif
+
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+	/* (a - bi) * (c - di) */
+	SUB	C11, C11, A1	# ac'+'bd
+	ADD	C13, A3, C13	# ad'+'cb
+	NEG	C13, C13
+	LD	A4, 152($sp)	# load alpha_r
+	LD	A2, 160($sp)
+
+	MUL	B1, C11, A4	# A1 = alpha_r
+	MUL	B2, C13, A4
+	NMSUB	B1, B1, C13, A2	# A2 = alpha_i
+	MADD	B2, B2, C11, A2
+
+	ST	B1, 0 * SIZE(CO1)
+	ST	B2, 1 * SIZE(CO1)
+#endif
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+    (!defined(LEFT) && !defined(TRANSA))
+	dsubu	TEMP, K, KK
+#ifdef LEFT
+	daddiu	TEMP, TEMP, -1
+#else
+	daddiu	TEMP, TEMP, -1
+#endif
+
+	dsll	TEMP, TEMP, ZBASE_SHIFT
+
+	daddu	AO, AO, TEMP
+	daddu	BO, BO, TEMP
+#endif
+
+#ifdef LEFT
+	daddiu	KK, KK, 1
+#endif
+
+#endif
+	daddiu	CO1, CO1, 2 * SIZE
+	daddiu	CO2, CO2, 2 * SIZE
+
+
+	.align 4
+.L10:
+	move	B, BO
+#if defined(TRMMKERNEL) && !defined(LEFT)
+	daddiu	KK, KK, 1
+#endif
+
+.L999:
+	ld	$16, 0($sp)
+	ld	$17, 8($sp)
+	ld	$18, 16($sp)
+	ld	$19, 24($sp)
+	ld	$20, 32($sp)
+	ld	$21, 40($sp)
+	ld	$22, 48($sp)
+
+	LD	$f24, 56($sp)
+	LD	$f25, 64($sp)
+	LD	$f26, 72($sp)
+	LD	$f27, 80($sp)
+	LD	$f28, 88($sp)
+
+#if defined(TRMMKERNEL)
+	ld	$23, 96($sp)
+	ld	$24, 104($sp)
+	ld	$25, 112($sp)
+#endif
+
+#ifndef __64BIT__
+	LD	$f20,120($sp)
+	LD	$f21,128($sp)
+	LD	$f22,136($sp)
+	LD	$f23,144($sp)
+#endif
+
+	daddiu	$sp,$sp,STACKSIZE
+	j	$31
+	nop
+
+	EPILOGUE
diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S
similarity index 100%
rename from kernel/mips64/gemm_kernel_loongson3a.S
rename to kernel/mips64/dgemm_kernel_loongson3a_4x4.S
diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c
new file mode 100644
index 000000000..7db595449
--- /dev/null
+++ b/kernel/mips64/gemv_n_loongson3a.c
@@ -0,0 +1,101 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
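For reference, the kernel added below is equivalent to the following minimal scalar loop. This is a sketch, not part of the patch itself: gemv_n_ref is a hypothetical name, and it assumes column-major A with leading dimension LDA and positive strides, matching the disabled negative-stride handling in the code.

static void gemv_n_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
                       FLOAT *a, BLASLONG lda,
                       FLOAT *x, BLASLONG incx,
                       FLOAT *y, BLASLONG incy)
{
	BLASLONG i, j, k, h;
	/* illustrative reference only: y += alpha * A * x, one column of A at a time */
	for (j = 0, k = 0; j < n; j++, k += incx)
		for (i = 0, h = 0; i < m; i++, h += incy)
			y[h] += alpha * a[lda * j + i] * x[k];
}

The spec_loop*/norm_loop* macros below unroll exactly this inner-loop body four deep, and the prefetch macro issues a load into $0 (the MIPS hardwired-zero register), a discarded read that pulls A and Y into cache about fahead elements ahead.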
+
+//#define prefetch(x) __builtin_prefetch(x)
+//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
+#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
+#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
+#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
+{
+
+	BLASLONG kx=0, ky=0;
+	if(!ALPHA)
+		return 0;
+
+	//if(INCX < 0)
+	//	kx = (1-N) * INCX;
+	//	INCX = -INCX;
+	//if(INCY < 0)
+	//	ky = (1-M) * INCY;
+	//	INCY = -INCY;
+
+	BLASLONG fahead = 30;
+	BLASLONG spec_unroll = 4;
+	BLASLONG tMQ = M - M % spec_unroll;
+	BLASLONG j = 0, k = 0;
+
+	if(ALPHA == 1) {
+		if(INCY == 1) {
+			for(k=kx; likely(j < N); j++, k += INCX) {
+				BLASLONG i = 0;
+				for(; likely(i < tMQ);) {
+					prefetch(A[LDA * j + i + fahead]);
+					prefetch(Y[i + fahead]);
+					/*loop_mark*/ spec_loop_alpha1;
+					/*loop_mark*/ spec_loop_alpha1;
+					/*loop_mark*/ spec_loop_alpha1;
+					/*loop_mark*/ spec_loop_alpha1;
+				}
+				for(; likely(i < M);) {
+					spec_loop_alpha1;
+				}
+			}
+		} else {
+			for(k=kx; likely(j < N); j++, k += INCX) {
+				BLASLONG i = 0, h = ky;
+				for(; likely(i < tMQ);) {
+					prefetch(A[LDA * j + i + fahead]);
+					prefetch(Y[h + fahead]);
+					/*loop_mark*/ norm_loop_alpha1;
+					/*loop_mark*/ norm_loop_alpha1;
+					/*loop_mark*/ norm_loop_alpha1;
+					/*loop_mark*/ norm_loop_alpha1;
+				}
+				for(; likely(i < M);) {
+					norm_loop_alpha1;
+				}
+			}
+		}
+	} else {
+		if(INCY == 1) {
+			for(k=kx; likely(j < N); j++, k += INCX) {
+				BLASLONG i = 0;
+				for(; likely(i < tMQ);) {
+					prefetch(A[LDA * j + i + fahead]);
+					prefetch(Y[i + fahead]);
+					/*loop_mark*/ spec_loop;
+					/*loop_mark*/ spec_loop;
+					/*loop_mark*/ spec_loop;
+					/*loop_mark*/ spec_loop;
+				}
+				for(; likely(i < M);) {
+					spec_loop;
+				}
+			}
+		} else {
+			for(k=kx; likely(j < N); j++, k += INCX) {
+				BLASLONG i = 0, h = ky;
+				for(; likely(i < tMQ);) {
+					prefetch(A[LDA * j + i + fahead]);
+					prefetch(Y[h + fahead]);
+					/*loop_mark*/ norm_loop;
+					/*loop_mark*/ norm_loop;
+					/*loop_mark*/ norm_loop;
+					/*loop_mark*/ norm_loop;
+				}
+				for(; likely(i < M);) {
+					norm_loop;
+				}
+			}
+		}
+	}
+	return 0;
+}
diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c
new file mode 100644
index 000000000..51f035d8e
--- /dev/null
+++ b/kernel/mips64/gemv_t_loongson3a.c
@@ -0,0 +1,93 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
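The transposed kernel that follows has the same structure, unrolled three deep: each element of y accumulates the dot product of one column of A with x. A comparable sketch, with the same caveats (gemv_t_ref is a hypothetical name; the patch applies ALPHA per element rather than factoring it out of the inner loop, which differs only in rounding):

static void gemv_t_ref(BLASLONG m, BLASLONG n, FLOAT alpha,
                       FLOAT *a, BLASLONG lda,
                       FLOAT *x, BLASLONG incx,
                       FLOAT *y, BLASLONG incy)
{
	BLASLONG i, j, k, h;
	/* illustrative reference only: y += alpha * A^T * x */
	for (j = 0, k = 0; j < n; j++, k += incy) {
		FLOAT t = 0;
		for (i = 0, h = 0; i < m; i++, h += incx)
			t += a[lda * j + i] * x[h];
		y[k] += alpha * t;
	}
}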
+ +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) +#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) +#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) +#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!ALPHA) + return 0; + +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 3; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0; + + if(ALPHA == 1) { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M);) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M);) { + norm_loop_alpha1; + } + } + } + } else { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M);) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M);) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S new file mode 100644 index 000000000..bc81d0eb5 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -0,0 +1,7797 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define STACKSIZE 160 +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +##### Parameter registers #### + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 
$f23
+#define C13 $f24
+#define C14 $f25
+#define C23 $f26
+#define C24 $f27
+#define C33 $f28
+#define C34 $f29
+#define C43 $f30
+#define C44 $f31
+
+#define I $2
+#define J $3
+#define L $7
+
+#### Alpha register ####
+#define ALPHA $f15
+
+#define F31 31
+#define F30 30
+#define F29 29
+#define F28 28
+#define F27 27
+#define F26 26
+#define F25 25
+#define F24 24
+#define F23 23
+#define F22 22
+#define F21 21
+#define F20 20
+#define F19 19
+#define F18 18
+#define F17 17
+#define F16 16
+#define F15 15
+#define F14 14
+#define F13 13
+#define F12 12
+#define F11 11
+#define F10 10
+#define F9 9
+#define F8 8
+#define F7 7
+#define F6 6
+#define F5 5
+#define F4 4
+#define F3 3
+#define F2 2
+#define F1 1
+#define F0 0
+
+#define R12 12
+#define R13 13
+
+#define R14 14
+#define R15 15
+#define R16 16
+#define R17 17
+
+#if defined(TRMMKERNEL)
+#define OFFSET $23
+#define KK $24
+#define TEMP $25
+#endif
+
+# .text
+# .align 2
+## .globl gemm
+# .set nomips16
+# .ent gemm
+# .type gemm, @function
+#gemm:
+# .frame $sp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0
+# .mask 0x40000000,-8
+# .fmask 0x00000000,0
+# .set noreorder
+# .set nomacro
+
+
+	PROLOGUE
+
+	daddiu	$sp,$sp,-STACKSIZE
+
+	sd	$16, 0($sp)
+	sd	$17, 8($sp)
+	sd	$18, 16($sp)
+	sd	$19, 24($sp)
+	sd	$20, 32($sp)
+	sd	$21, 40($sp)
+	sd	$22, 48($sp)
+
+	ST	$f24, 56($sp)
+	ST	$f25, 64($sp)
+	ST	$f26, 72($sp)
+	ST	$f27, 80($sp)
+	ST	$f28, 88($sp)
+
+#if defined(TRMMKERNEL)
+	sd	$23, 96($sp)
+	sd	$24, 104($sp)
+	sd	$25, 112($sp)
+
+	LDARG	OFFSET, 160($sp)
+#endif
+
+#ifndef __64BIT__
+	ST	$f20,120($sp)
+	ST	$f21,128($sp)
+	ST	$f22,136($sp)
+	ST	$f23,144($sp)
+#endif
+
+	.align 4
+.L4:
+	dsra	J, N, 2	# NR=4
+	dsll	LDC, LDC, BASE_SHIFT	# LDC * SIZE
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+	neg	KK, OFFSET
+#endif
+
+	blez	J, .L2
+	ST	ALPHA, 152($sp)
+
+.L48:
+	dsra	I, M, 3	# MR=8
+	dsll	PREA, K, BASE_SHIFT
+
+	move	AO, A	# Reset A
+	move	CO1, C
+
+	daddu	CO2, C, LDC
+	daddu	CO3, CO2, LDC
+
+	daddu	CO4, CO3, LDC
+	daddu	PREA, A, PREA
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	move	KK, OFFSET
+#endif
+
+	blez	I, .L44
+	daddu	C, CO4, LDC
+
+	.align 4
+.L481:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) ||\
+    (!defined(LEFT) && !defined(TRANSA))
+	move	BO, B
+#else
+	dsll	L, KK, 3 + BASE_SHIFT	# KK * MR (8) * data size
+	dsll	TEMP, KK, 2 + BASE_SHIFT	# KK * NR (4) * data size
+
+	daddu	AO, AO, L	# AO points to the data address
+	daddu	BO, B, TEMP
+#endif
+	MTC	$0, C11	# CLEAR RESULT REGISTERS
+	MOV	C12, C11
+
+	dsll	PREB, K, BASE_SHIFT
+	MOV	C21, C11
+	MOV	C22, C11
+
+	MOV	C31, C11
+	MOV	C32, C11
+	gsLQC1(R13, F9, F8, 0)	# B1 B2
+
+	MOV	C41, C11
+	MOV	C42, C11
+	gsLQC1(R12, F1, F0, 0)	# A1 A2
+
+	MOV	C13, C11
+	MOV	C14, C11
+	gsLQC1(R12, F3, F2, 1)	# A3 A4
+
+	MOV	C23, C11
+	FETCH	$0, 0 * SIZE(CO1)
+	MOV	C24, C11
+	FETCH	$0, 4 * SIZE(CO1)
+
+	MOV	C33, C11
+	FETCH	$0, 0 * SIZE(CO2)
+	MOV	C34, C11
+	FETCH	$0, 4 * SIZE(CO2)
+
+	daddu	PREB, B, PREB
+	MOV	C43, C11
+	FETCH	$0, 0 * SIZE(CO3)
+
+	MOV	C44, C11
+	FETCH	$0, 4 * SIZE(CO3)
+
+	PLU	B3, B1, B1
+	FETCH	$0, 0 * SIZE(CO4)
+
+	PLU	B4, B2, B2
+	FETCH	$0, 4 * SIZE(CO4)
+
+#if (defined(LEFT) && !defined(TRANSA)) ||\
+    (!defined(LEFT) && defined(TRANSA))
+	dsubu	TEMP, K, KK	# TEMP is the length of the data part
+#elif defined(LEFT)
+	daddiu	TEMP, KK, 8
+#else
+	daddiu	TEMP, KK, 4
+#endif
+	dsra	L, TEMP, 6
+	blez	L, .L482
+	NOP
+#else
+	# GEMM PART
+	move	BO, B	# Reset B
+	dsra	L, K, 6	# unroll K by 64
+
+	MTC	$0, C11	# CLEAR RESULT REGISTERS
+	MOV	C12, C11
+
+	dsll	PREB, K, BASE_SHIFT
+	MOV	C21, 
C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + FETCH $0, 4 * SIZE(CO3) + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L482 + FETCH $0, 4 * SIZE(CO4) +#endif + +.L4810: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) 
# A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 
* SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + 
MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, 
C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, 
A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, 
A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + 
MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS 
C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS 
C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + 
gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + 
+ MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + 
FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + bgtz L, .L4810 + MADPS C44, C44, A8, B8 + + .align 4 +.L482: +#ifndef TRMMKERNEL + andi L, K, 32 +#else + andi L, TEMP, 32 +#endif + blez L, .L483 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 
+ MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 
+ gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + 
MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 
32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, 
C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + 
daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L483: +#ifndef TRMMKERNEL + andi L, K, 16 +#else + andi L, TEMP, 16 +#endif + blez L, .L484 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS 
C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + 
MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, 
A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L484: +#ifndef TRMMKERNEL + andi L, K, 8 +#else + andi L, TEMP, 8 +#endif + blez L, .L485 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) 
+ + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L485: +#ifndef TRMMKERNEL + andi L, K, 4 +#else + andi L, TEMP, 4 +#endif + blez L, .L486 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, 
C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L486: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L487 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 8 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 16 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 8 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, 
A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + daddiu PREA, PREA, 16 * SIZE + + + .align 4 +.L487: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L480 + LD ALPHA, 152($sp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 8 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + + + .align 4 +.L480: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + CVTU A3, C23 # A3=C23.upper=c14 + LD B1, 1 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B2, 1 * SIZE(CO2) + + CVTU A5, C33 # A5=C33.upper=c16 + LD B3, 3 * SIZE(CO1) + + CVTU A6, C31 # A6=C31.upper=c26 + LD B4, 3 * SIZE(CO2) + + CVTU A7, C43 # A7=C43.upper=c18 + LD B5, 5 * SIZE(CO1) + + CVTU A8, C41 # A8=C41.upper=c28 + LD B6, 5 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B7, 7 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B1, 7 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B2, 0 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B3, 0 * SIZE(CO2) + + MADD A5, B5, A5, ALPHA # c16 + LD B4, 2 * SIZE(CO1) + + MADD A6, B6, A6, ALPHA # c26 + LD B5, 2 * SIZE(CO2) + + MADD A7, B7, A7, ALPHA # c18 + LD B6, 4 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 + ST A1, 1 * SIZE(CO1) + + MADD C11, B2, C11, ALPHA # c12 + LD B7, 4 * SIZE(CO2) + + MADD C13, B3, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MADD C21, B4, C21, ALPHA # c14 + LD A1, 6 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 + ST A3, 3 * SIZE(CO1) + + MADD C31, B6, C31, ALPHA # c16 + LD A2, 6 * SIZE(CO2) + + MADD C33, B7, C33, ALPHA # c26 + ST A4, 3 * SIZE(CO2) + + ST A5, 5 * SIZE(CO1) + ST A6, 5 * SIZE(CO2) + ST A7, 7 * SIZE(CO1) + ST A8, 7 * SIZE(CO2) + + MADD C41, A1, C41, ALPHA # c18 + ST C11, 0 * SIZE(CO1) + + MADD C43, A2, C43, ALPHA # c28 + ST C13, 0 * SIZE(CO2) + + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + ST C31, 4 * SIZE(CO1) + ST C33, 4 * SIZE(CO2) + ST C41, 6 * SIZE(CO1) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C43, 6 * SIZE(CO2) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B1, 1 * SIZE(CO3) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B2, 1 * SIZE(CO4) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B3, 3 * SIZE(CO3) + + CVTU A5, C34 # B5=C32.upper=c46 + LD B4, 3 * SIZE(CO4) + + CVTU A6, C32 # B6=C24.upper=c36 + LD B5, 5 * SIZE(CO3) + + CVTU A7, C44 # B7=C42.upper=c48 + LD B6, 5 * SIZE(CO4) + + CVTU A8, C42 # A1=C44.upper=c38 + LD B7, 7 * SIZE(CO3) + + MADD A1, B1, A1, ALPHA # c31 + LD C11, 7 * SIZE(CO4) + + MADD A2, B2, A2, ALPHA + LD C13, 0 * SIZE(CO3) + + MADD A3, B3, A3, ALPHA + LD C21, 0 * SIZE(CO4) + + MADD A4, B4, A4, ALPHA + LD C23, 2 * SIZE(CO3) + + MADD A5, B5, A5, ALPHA + LD C31, 2 * SIZE(CO4) + + MADD A6, B6, A6, ALPHA + LD C33, 4 * SIZE(CO3) + + MADD A7, B7, A7, ALPHA + LD C41, 4 * SIZE(CO4) + + MADD A8, C11, A8, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C12, C13, C12, ALPHA + LD C43, 6 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MADD C22, C23, C22, ALPHA + LD B1, 6 * SIZE(CO4) + + MADD C24, C31, C24, ALPHA 
+ ST A3, 3 * SIZE(CO3) + + MADD C32, C33, C32, ALPHA + ST A4, 3 * SIZE(CO4) + + MADD C34, C41, C34, ALPHA + ST A5, 5 * SIZE(CO3) + + MADD C42, C43, C42, ALPHA + ST A6, 5 * SIZE(CO4) + + ST A7, 7 * SIZE(CO3) + NOP + + MADD C44, B1, C44, ALPHA + ST A8, 7 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE + daddiu CO3, CO3, 8 * SIZE + bgtz I, .L481 + daddiu CO4, CO4, 8 * SIZE +#else + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + CVTU A5, C33 # A5=C33.upper=c16 + CVTU A6, C31 # A6=C31.upper=c26 + CVTU A7, C43 # A7=C43.upper=c18 + CVTU A8, C41 # A8=C41.upper=c28 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + MUL A5, A5, ALPHA # c16 + MUL A6, A6, ALPHA # c26 + MUL A7, A7, ALPHA # c18 + MUL A8, A8, ALPHA # c28 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + MUL C31, C31, ALPHA # c16 + ST A5, 5 * SIZE(CO1) + + MUL C33, C33, ALPHA # c26 + ST A6, 5 * SIZE(CO2) + + MUL C41, C41, ALPHA # c18 + ST A7, 7 * SIZE(CO1) + + MUL C43, C43, ALPHA # c28 + ST A8, 7 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A2, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A3, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A4, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + CVTU A5, C34 # B5=C32.upper=c46 + ST C31, 4 * SIZE(CO1) + + CVTU A6, C32 # B6=C24.upper=c36 + ST C33, 4 * SIZE(CO2) + + CVTU A7, C44 # B7=C42.upper=c48 + ST C41, 6 * SIZE(CO1) + + CVTU A8, C42 # A1=C44.upper=c38 + ST C43, 6 * SIZE(CO2) + + MUL A1, A1, ALPHA # c31 + MUL A2, A2, ALPHA + MUL A3, A3, ALPHA + MUL A4, A4, ALPHA + MUL A5, A5, ALPHA + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + MUL C32, C32, ALPHA + ST A5, 5 * SIZE(CO3) + + MUL C34, C34, ALPHA + ST A6, 5 * SIZE(CO4) + + MUL C42, C42, ALPHA + ST A7, 7 * SIZE(CO3) + + MUL C44, C44, ALPHA + ST A8, 7 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE + daddiu CO3, CO3, 8 * SIZE + daddiu CO4, CO4, 8 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + bgtz I, .L481 + NOP +#endif + + .align 4 +.L44: + andi I, M, 4 # MR=4 + blez I, .L42 + NOP + + .align 4 +.L441: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC 
$0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 + +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddu TEMP, KK, 4 +#else + daddu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L442 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + blez L, .L442 + PLU B4, B2, B2 +#endif + +.L4410: # + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A5, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C21, C21, A6, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C12, C12, A5, B2 + FETCH $0, 8 * SIZE(PREB) + daddiu BO, BO, 16 * SIZE # 4KR*4NR + + MADPS C22, C22, A6, B2 + FETCH $0, 8 * SIZE(PREA) + daddiu AO, AO, 16 * SIZE # 4KR*4MR + + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A7, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A8, B5 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C12, C12, A7, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C22, C22, A8, B6 + FETCH $0, 12 * SIZE(PREA) + + MADPS C13, C13, A7, B7 + daddiu PREA, PREA, 16 * SIZE + MADPS C23, C23, A8, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L4410 + PLU B4, B2, B2 + + .align 4 +.L442: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L443 + NOP + + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + daddiu BO, BO, 8 * SIZE # 2KR*4NR + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + daddiu AO, AO, 8 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + 
PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F1, F0, 0) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + daddiu PREB, PREB, 8 + MADPS C23, C23, A4, B7 + daddiu PREA, PREA, 8 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L443: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L440 + LD ALPHA, 152($sp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 1KR*4NR + MADPS C22, C22, A2, B2 + daddiu AO, AO, 4 * SIZE # 1KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + + .align 4 +.L440: +#ifndef TRMMKERNEL + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + CVTU A3, C23 # A3=C23.upper=c14 + LD B3, 3 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B4, 3 * SIZE(CO2) + + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B7, 2 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B1, 2 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MADD C21, B7, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MADD C23, B1, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B3, 3 * SIZE(CO3) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B4, 3 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD A3, B3, A3, ALPHA + LD A7, 2 * SIZE(CO3) + + MADD A4, B4, A4, ALPHA + LD A8, 2 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MADD C22, A7, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MADD C24, A8, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + CVTU A5, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A6, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A7, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A8, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + MUL A5, A5, ALPHA # c31 + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A5, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A6, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A7, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A8, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) 
+ ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif + + .align 4 +.L42: + andi I, M, 2 + blez I, .L41 + NOP + + .align 4 +.L421: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L422 + NOP + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + blez L, .L422 + PLU B4, B2, B2 +#endif + +.L4210: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + gsLQC1(R12, F3, F2, 1) # B1 B2 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 8 * SIZE # 4KR*2MR + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A3, B1 + gsLQC1(R12, F1, F0, 0) # B3 B4 + + MADPS C12, C12, A3, B2 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + gsLQC1(R13, F9, F8, 0) # B3 B4 + + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L4210 + PLU B4, B2, B2 + + .align 4 +.L422: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L423 + NOP + + daddiu AO, AO, 4 * SIZE # 2KR*2MR + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + daddiu BO, BO, 8 * SIZE # 2KR*2MR + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + gsLQC1(R12, F1, F0, 0) + + PLU B3, B1, B1 + PLU B4, 
B2, B2 + +.L423: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L420 + LD ALPHA, 152($sp) + + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 2KR*4NR + daddiu AO, AO, 2 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L420: +#ifndef TRMMKERNEL + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + + MUL C11, C11, ALPHA # c12 + MUL C13, C13, ALPHA # c22 + + CVTU A3, C14 # B1=C12.upper=c42 + CVTU A4, C12 # B2=C14.upper=c32 + + MUL A3, A3, ALPHA # c31 + ST A1, 1 * SIZE(CO1) + + MUL A4, A4, ALPHA + ST A2, 1 * SIZE(CO2) + + MUL C12, C12, ALPHA + ST C11, 0 * SIZE(CO1) + + MUL C14, C14, ALPHA + ST C13, 0 * SIZE(CO2) + + ST A3, 1 * SIZE(CO3) + ST A4, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 4 +.L41: + andi I, M, 1 + blez I, .L40 + NOP + + .align 4 +.L411: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L412 + +#else + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV 
C34, C11 + + MOV C43, C11 + blez L, .L412 + MOV C44, C11 +#endif + +.L4110: + daddiu L, L, -1 + LD A2, 1 * SIZE(AO) + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A3, 2 * SIZE(AO) + NOP + + MADD C11, C11, A2, B5 + LD B1, 8 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 9 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 10 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 11 * SIZE(BO) + + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B1 + LD B5, 12 * SIZE(BO) + + MADD C12, C12, A3, B2 + LD B6, 13 * SIZE(BO) + + MADD C13, C13, A3, B3 + LD B7, 14 * SIZE(BO) + + MADD C14, C14, A3, B4 + LD B8, 15 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD C11, C11, A4, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A4, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A4, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A4, B8 + bgtz L, .L4110 + LD B4, 3 * SIZE(BO) + +.L412: +#ifndef TRMMKERNEL + andi L, K, 2 +#else + andi L, TEMP, 2 +#endif + blez L, .L413 + NOP + + LD A2, 1 * SIZE(AO) + daddiu AO, AO, 2 * SIZE + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 8 * SIZE + + MADD C11, C11, A2, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 3 * SIZE(BO) + +.L413: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L410 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + MADD C13, C13, A1, B3 + MADD C14, C14, A1, B4 + daddiu BO, BO, 4 * SIZE + + .align 4 +.L410: +#ifndef TRMMKERNEL + LD A5, 0 * SIZE(CO1) + LD A6, 0 * SIZE(CO2) + LD A7, 0 * SIZE(CO3) + LD A8, 0 * SIZE(CO4) + + MADD A5, A5, C11, ALPHA + MADD A6, A6, C12, ALPHA + MADD A7, A7, C13, ALPHA + MADD A8, A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#else + MUL A5, C11, ALPHA + MUL A6, C12, ALPHA + MUL A7, C13, ALPHA + MUL A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 4 +.L40: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif + daddiu J, J, -1 + move B, BO + bgtz J, .L48 + NOP + + + + .align 4 +.L2: # Nr=2 + andi J, N, 2 + blez J, .L1 + NOP + +.L28: + dsra I, M, 3 # MR=8 + + move AO, A # Reset A + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + daddu CO2, C, LDC + blez I, .L24 + daddu C, CO2, LDC + + .align 4 +.L281: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 3 + BASE_SHIFT + 
dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L282 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L282 + MOV C44, C11 +#endif + + .align 4 +.L2810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + LD B3, 2 * SIZE(BO) + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + LD A2, 13 * SIZE(AO) + + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + LD A3, 14 * SIZE(AO) + + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B3 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B3 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B3 + MADD C23, C23, A2, B3 + LD A7, 6 * SIZE(AO) + + MADD C33, C33, A3, B3 + MADD C43, C43, A4, B3 + LD A8, 7 * SIZE(AO) + + MADD C14, C14, A1, B4 + MADD C24, C24, A2, B4 + LD B1, 0 * SIZE(BO) + + MADD C34, C34, A3, B4 + MADD C44, C44, A4, B4 + LD B2, 1 * SIZE(BO) + + MADD C31, C31, B7, B3 + MADD C41, C41, B8, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, B5, B4 + LD A2, 1 * SIZE(AO) + + MADD C22, C22, B6, B4 + LD A3, 2 * SIZE(AO) + + LD A4, 3 * SIZE(AO) + MADD C32, C32, B7, B4 + bgtz L, .L2810 + MADD C42, C42, B8, B4 + + .align 4 +.L282: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L280 + LD ALPHA, 152($sp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L280: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + LD A5, 4 * SIZE(CO1) + LD A6, 5 * SIZE(CO1) + LD A7, 6 * 
SIZE(CO1) + LD A8, 7 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD A5, A5, C13, ALPHA + LD B5, 4 * SIZE(CO2) + + MADD A6, A6, C23, ALPHA + LD B6, 5 * SIZE(CO2) + + MADD A7, A7, C33, ALPHA + LD B7, 6 * SIZE(CO2) + + MADD A8, A8, C43, ALPHA + LD C11, 7 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MADD B5, B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MADD B6, B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MADD B7, B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MADD C11, C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE +#else + daddiu I, I, -1 + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + MUL B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MUL B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MUL B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MUL B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MUL B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MUL B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MUL B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MUL C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE +#endif + + + .align 4 +.L24: + andi I, M, 4 # MR=4 + blez I, .L22 + NOP + + .align 4 +.L241: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L242 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV 
C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L242 + MOV C44, C11 +#endif + + .align 4 +.L2410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD B4, 3 * SIZE(BO) + + MADD C41, C41, A4, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 4 * SIZE + + MADD C12, C12, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C22, C22, A2, B2 + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD B2, 1 * SIZE(BO) + + MADD C41, C41, A8, B3 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A5, B4 + LD A3, 2 * SIZE(AO) + + MADD C22, C22, A6, B4 + LD A4, 3 * SIZE(AO) + + MADD C32, C32, A7, B4 + bgtz L, .L2410 + MADD C42, C42, A8, B4 + + .align 4 +.L242: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L240 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L240: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE +#else + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + + MUL B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MUL B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MUL B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MUL B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif + + .align 4 +.L22: + andi I, M, 2 + blez I, .L21 + NOP + + .align 4 +.L221: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, 
KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L222 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L222 + MOV C44, C11 +#endif + + + .align 4 +.L2210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A3, 2 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C22, C22, A2, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A4, B3 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A3, B4 + LD B2, 1 * SIZE(BO) + + MADD C22, C22, A4, B4 + bgtz L, .L2210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L222: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L220 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L220: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#else + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL B1, C12, ALPHA + MUL B2, C22, ALPHA + + ST A1, 0 * SIZE(CO1) + ST A2, 1 * SIZE(CO1) + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddu KK, KK, 2 +#endif +#endif + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L211: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L212 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L212 + MOV C44, C11 +#endif + + .align 4 +.L2110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A1, B2 + LD B3, 2 * SIZE(BO) + + LD B4, 3 * SIZE(BO) + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A2, B3 + LD A1, 0 * 
SIZE(AO) + + MADD C12, C12, A2, B4 + LD B1, 0 * SIZE(BO) + + bgtz L, .L2110 + LD B2, 1 * SIZE(BO) + + + .align 4 +.L212: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L210 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L210: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#else + + MUL A1, C11, ALPHA + MUL B1, C12, ALPHA + + ST A1, 0 * SIZE(CO1) + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 4 +.L20: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move B, BO + + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + +.L18: + dsra I, M, 3 # MR=8 + move AO, A # Reset A + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + blez I, .L14 + NOP + + + .align 4 +.L181: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 3 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L182 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L182 + MOV C44, C11 +#endif + + + .align 4 +.L1810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C13, C13, A5, B1 + LD B2, 1 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + LD A2, 13 * SIZE(AO) + + MADD C43, C43, A8, B1 + LD A3, 14 * SIZE(AO) + + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B2 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B2 + LD A6, 5 * SIZE(AO) + + MADD 
C13, C13, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C23, C23, A2, B2 + LD A8, 7 * SIZE(AO) + + MADD C33, C33, A3, B2 + LD B1, 0 * SIZE(BO) + + MADD C43, C43, A4, B2 + LD A1, 0 * SIZE(AO) + + MADD C31, C31, B7, B2 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, B8, B2 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1810 + LD A4, 3 * SIZE(AO) + + .align 4 +.L182: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L180 + LD ALPHA, 152($sp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L180: # Write Back +#ifndef TRMMKERNEL + daddiu I, I, -1 + + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + LD A5, 4 * SIZE(C) + LD A6, 5 * SIZE(C) + LD A7, 6 * SIZE(C) + LD A8, 7 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + MADD A5, A5, C13, ALPHA + MADD A6, A6, C23, ALPHA + MADD A7, A7, C33, ALPHA + MADD A8, A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 7 * SIZE(C) + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP +#else + daddiu I, I, -1 + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 7 * SIZE(C) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK + +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP +#endif + + .align 4 +.L14: + andi I, M, 4 # MR=4 + blez I, .L12 + NOP + + .align 4 +.L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L142 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L142 + MOV C44, C11 +#endif + + .align 4 +.L1410: + daddiu L, L, 
-1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 1 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 2 * SIZE + + MADD C41, C41, A4, B1 + LD A7, 6 * SIZE(AO) + + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, A8, B3 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1410 + LD A4, 3 * SIZE(AO) + + .align 4 +.L142: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L140 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L140: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE +#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif + + .align 4 +.L12: + andi I, M, 2 + blez I, .L11 + NOP + + .align 4 +.L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L122 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + blez L, .L122 + MOV C44, C11 +#endif + + .align 4 +.L1210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B3, 1 * SIZE(BO) + + MADD C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + LD A3, 2 * SIZE(AO) + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD B1, 0 * SIZE(BO) + + MADD C21, C21, A4, B3 + LD A1, 0 * SIZE(AO) + bgtz L, .L1210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L122: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L120 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L120: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + + ST A1, 0 * SIZE(C) + 
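# Reviewer note: .L120 is the N=1, M=2 write-back. The plain GEMM path
# updates C in place (C += alpha * acc); the TRMMKERNEL branch below
# overwrites C with alpha * acc and then adjusts AO/BO/KK for the
# triangular case.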
ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE +#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + daddu AO, AO, L + daddu BO, B, L +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + MOV C32, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L112 + NOP + +#else + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + blez L, .L112 + MOV C32, C11 +#endif + + + .align 4 +.L1110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + + LD A2, 1 * SIZE(AO) + LD B2, 1 * SIZE(BO) + + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + MADD C11, C11, A2, B2 + LD A1, 0 * SIZE(AO) + LD B1, 0 * SIZE(BO) + + bgtz L, .L1110 + NOP + + + .align 4 +.L112: +#ifndef TRMMKERNEL + andi L, K, 1 +#else + andi L, TEMP, 1 +#endif + blez L, .L110 + LD ALPHA, 152($sp) + + MADD C11, C11, A1, B1 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L110: # Write Back +#ifndef TRMMKERNEL + LD A1, 0 * SIZE(C) + + MADD A1, A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + + daddiu C, C, 1 * SIZE +#else + MUL A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + + daddiu C, C, 1 * SIZE + +#endif + + .align 4 +.L10: + move B, BO + NOP + +.L999: + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + +#if defined(TRMMKERNEL) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) +#endif + +#ifndef __64BIT__ + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) +#endif + + daddiu $sp,$sp,STACKSIZE + j $31 + nop + + EPILOGUE +# .set macro +# .set reorder +# .end gemm +# .size gemm, .-gemm +# .ident "GCC: (Debian 4.4.6-6) 4.4.6" diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S similarity index 100% rename from kernel/mips64/sgemm_kernel_loongson3a.S rename to kernel/mips64/sgemm_kernel_loongson3a_4x4.S diff --git a/kernel/mips64/zgemm_kernel_loongson3a_2x2.S b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S new file mode 100644 index 000000000..a8faad2f6 --- /dev/null +++ b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S @@ -0,0 +1,1355 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 
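# Reviewer note: the gsLQC1/gsSQC1 macros defined at the top of this file
# hand-encode the Loongson-specific 128-bit quad-word FP load/store (one
# access moves a pair of FP registers, fq/ft) as raw .word values,
# presumably because the assemblers of the day lacked the mnemonics.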
+#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2 + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, c11 # Clear results regs + MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # 
R:a1 I:a2 + + MOV c13, c11 + MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MOV c21, c11 + MOV c22, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + + MOV c23, c11 + MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + + MOV c13, c11 + MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MOV c21, c11 + MOV c22, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + + MOV c23, c11 + MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 4) # unroll k=2 + gsLQC1(R13, F5, F4, 4) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F7, F6, 5) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R13, F13, F12, 6) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F16, F15, 7) + gsLQC1(R12, F11, F10, 7) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + daddu PREB, PREB, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + 
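# Reviewer note: each MADD1..MADD4 group below performs one complex
# multiply-accumulate. For (a + b*i)*(c + d*i), the c11/c14 pair collects
# the real part (a*c and +/- b*d) and the c12/c13 pair the imaginary part
# (b*c and +/- a*d); the pairs are summed in the write-back at .L18, and
# the MADD/NMSUB aliases selected at the top of the file supply the signs
# for the conjugated (NR/RN/RR/...) variants.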
MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ 
+ (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F13, F12, 2) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + NOP + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F5, F4, 4) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + gsLQC1(R13, F7, F6, 5) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + NOP + + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F13, F12, 6) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + gsLQC1(R13, F16, F15, 7) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + daddiu PREB, PREB, 16 * SIZE + + MADD1 c31, c31, a7, b7 # A1xB2 + MADD3 c33, c33, a7, b8 + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, 
c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + NOP + + bgtz L, .L36 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV 
c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F11, F10, 3) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 4) # Unroll K=2 + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + gsLQC1(R13, F13, F12, 2) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + gsLQC1(R12, F3, F2, 5) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + gsLQC1(R12, F11, F10, 7) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + daddiu PREA, PREA, 16 * SIZE + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + 
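# Reviewer note: the MUL/NMSUB/MADD sequence above applies the complex
# alpha to the accumulated result: re' = alpha_r*re - alpha_i*im,
# im' = alpha_r*im + alpha_i*re.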
ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + + gsLQC1(R13, F13, F12, 2) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + daddiu L, L, -1 + + gsLQC1(R12, F11, F10, 3) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MADD2 c12, c12, a8, b7 # bxc + bgtz L, .L42 + MADD4 c14, c14, a8, b8 # bxd + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 
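# Reviewer note: TRMM tail bookkeeping. TEMP = K - KK is the k-extent this
# block has not consumed; after subtracting the unroll width and scaling by
# ZBASE_SHIFT, it advances AO and BO past the packed panel data that the
# triangular update skips.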
+#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c new file mode 100644 index 000000000..3b1b6f73b --- /dev/null +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -0,0 +1,139 @@ +#include "common.h" + +//typedef int BLASLONG; +//typedef double FLOAT; + +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 +#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + +#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] 
+= rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!rALPHA && iALPHA) + return 0; + + BLASLONG fahead = 60; + BLASLONG spec_unroll = 2; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0, jj = 0; + + if(rALPHA == 1 && iALPHA == 0) { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M); i++) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M); i++) { + norm_loop_alpha1; + } + } + } + } else { + FLOAT rTmp, iTmp; + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M); i++) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M); i++) { + 
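/* Reviewer note: scalar remainder loop -- covers the last M % spec_unroll
   rows left over from the 2x-unrolled, prefetching main loop above. */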
norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c new file mode 100644 index 000000000..3af44caf2 --- /dev/null +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -0,0 +1,125 @@ +#include "common.h" + +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 +#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + + +#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_3 do {Y[k] += A[jj 
+
+#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+
+    if(!rALPHA && !iALPHA)
+        return 0;
+
+    BLASLONG fahead = 30;
+    BLASLONG spec_unroll = 2;
+    BLASLONG tMQ = M - M % spec_unroll;
+    BLASLONG j = 0, k = 0, jj = 0;
+
+    if(rALPHA == 1 && iALPHA == 0) {
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[ii + fahead]);
+                    /*loop_mark*/ spec_loop_alpha1;
+                    /*loop_mark*/ spec_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop_alpha1;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[iii + fahead]);
+                    /*loop_mark*/ norm_loop_alpha1;
+                    /*loop_mark*/ norm_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop_alpha1;
+                }
+            }
+        }
+    } else {
+        FLOAT rTmp, iTmp;
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[ii + fahead]);
+                    /*loop_mark*/ spec_loop;
+                    /*loop_mark*/ spec_loop;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[iii + fahead]);
+                    /*loop_mark*/ norm_loop;
+                    /*loop_mark*/ norm_loop;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop;
+                }
+            }
+        }
+    }
+    return 0;
+}
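Reviewer's note: the four sign patterns in the spec_loop/norm_loop macro variants above are exactly the four conjugation combinations of a complex multiply-accumulate. Variants 1 and 3 conjugate the matrix element (CONJ), variants 2 and 3 conjugate the vector element (XCONJ), and alpha itself is never conjugated in any variant. A scalar reference one could cross-check the kernel against, as a hypothetical harness that is not part of the patch, assuming double precision and unit strides:

    #include <complex.h>
    #include <stdio.h>

    /* Hypothetical scalar reference for the kernel above: column-major A
     * with leading dimension lda (in complex elements), interleaved
     * re/im storage, inc_x = inc_y = 1.  conj_a/conj_x select the same
     * four sign patterns as the CONJ/XCONJ macro variants. */
    static void zgemv_t_ref(long m, long n, double complex alpha,
                            const double *a, long lda, const double *x,
                            double *y, int conj_a, int conj_x) {
        for (long j = 0; j < n; j++) {
            double complex sum = 0;
            for (long i = 0; i < m; i++) {
                double complex aij = a[2 * (j * lda + i)]
                                   + a[2 * (j * lda + i) + 1] * I;
                double complex xi  = x[2 * i] + x[2 * i + 1] * I;
                if (conj_a) aij = conj(aij);   /* CONJ:  variants 1 and 3 */
                if (conj_x) xi  = conj(xi);    /* XCONJ: variants 2 and 3 */
                sum += aij * xi;               /* y[j] accumulates A(i,j)*x[i] */
            }
            y[2 * j]     += creal(alpha * sum);
            y[2 * j + 1] += cimag(alpha * sum);
        }
    }

    int main(void) {
        double a[2] = {1, 2}, x[2] = {3, 4}, y[2] = {0, 0};   /* 1x1 case */
        zgemv_t_ref(1, 1, 1.0, a, 1, x, y, 0, 0);
        printf("y = %g + %gi\n", y[0], y[1]);   /* (1+2i)(3+4i) = -5 + 10i */
        return 0;
    }

Run against random M, N and LDA, this also pins down the storage convention: element (i,j) of A lives at a[2*(j*lda+i)] and a[2*(j*lda+i)+1].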
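The param.h hunk below retunes the Loongson 3A blocking. The rationale is not stated in the patch, but the arithmetic suggests cache fitting: with the new DGEMM values the packed A block of P x Q doubles is about 31 KB, comfortably inside a 64 KB L1 data cache, and parameter.c (earlier in this patch) switches dgemm_r between 1024 and 200 so the Q x dgemm_r B panel shrinks when several threads share the chip. A quick footprint check; the cache-fit interpretation and cache-size figure are my assumptions, only P, Q and the dgemm_r values come from the patch:

    #include <stdio.h>

    /* Footprint arithmetic for the new Loongson 3A DGEMM blocking.
     * P = DGEMM_DEFAULT_P, Q = DGEMM_DEFAULT_Q; dgemm_r is 1024 with one
     * thread and 200 with several (see parameter.c in this patch). */
    int main(void) {
        const long sz = sizeof(double);                 /* 8 bytes */
        const long P = 44, Q = 92, R1 = 1024, Rn = 200;
        printf("packed A block  P*Q*8:  %ld KB\n", P * Q * sz / 1024);
        printf("B panel, 1 thread  Q*R*8: %ld KB\n", Q * R1 * sz / 1024);
        printf("B panel, N threads Q*R*8: %ld KB\n", Q * Rn * sz / 1024);
        return 0;
    }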
diff --git a/param.h b/param.h
index 603caab46..4ffe05cf8 100644
--- a/param.h
+++ b/param.h
@@ -1480,31 +1480,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL

-#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_M 8
 #define SGEMM_DEFAULT_UNROLL_N 4

 #define DGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_N 4

-#define CGEMM_DEFAULT_UNROLL_M 1
-#define CGEMM_DEFAULT_UNROLL_N 4
-#define ZGEMM_DEFAULT_UNROLL_M 1
-#define ZGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_N 2

-#define SGEMM_DEFAULT_P 32
-#define DGEMM_DEFAULT_P 32
-#define CGEMM_DEFAULT_P 108
-#define ZGEMM_DEFAULT_P 112
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_N 2

-#define SGEMM_DEFAULT_Q 116
-#define DGEMM_DEFAULT_Q 116
-#define CGEMM_DEFAULT_Q 144
-#define ZGEMM_DEFAULT_Q 72
+#define SGEMM_DEFAULT_P 64
+#define DGEMM_DEFAULT_P 44
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 32

-#define SGEMM_DEFAULT_R 1000
-#define DGEMM_DEFAULT_R 1000
-#define CGEMM_DEFAULT_R 2000
-#define ZGEMM_DEFAULT_R 2000
+#define SGEMM_DEFAULT_Q 192
+#define DGEMM_DEFAULT_Q 92
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 80
+
+#define SGEMM_DEFAULT_R 1024
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R 1024
+#define ZGEMM_DEFAULT_R 1024
+
+#define GEMM_OFFSET_A1 0x10000
+#define GEMM_OFFSET_B1 0x100000

 #define SYMV_P 16

 #endif
diff --git a/test/cblat3.f b/test/cblat3.f
index b26be91e6..5df1ddd64 100644
--- a/test/cblat3.f
+++ b/test/cblat3.f
@@ -1301,6 +1301,8 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )
diff --git a/test/zblat3.f b/test/zblat3.f
index d6a522f2a..f03b1a617 100644
--- a/test/zblat3.f
+++ b/test/zblat3.f
@@ -1303,6 +1303,8 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )
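Finally, the two test-driver hunks initialize RALS and RBETS alongside ERRMAX. In the check routines these variables otherwise appear to be assigned only on some branches, so a run that skips those branches would read undefined values. A C analogue of the hazard and of the fix, with names invented for illustration:

    #include <stdio.h>

    /* Illustrative analogue of the cblat3.f/zblat3.f fix: a variable that
     * is assigned only on one branch needs a safe default before any
     * unconditional read. */
    int main(void) {
        int conj = 0;            /* branch flag, like CONJ in the tests  */
        float ralpha = 2.0f;
        float rals = 1.0f;       /* the fix: default to one (RONE) up front */
        if (conj)
            rals = ralpha;       /* previously the only assignment       */
        printf("rals = %f\n", rals);  /* read happens on both paths      */
        return 0;
    }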