From 1c96d345e22efcc68df697584282d8cc00361e18 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 21 Jun 2011 22:16:23 +0000 Subject: [PATCH 01/30] Improve zgemm performance from 1G to 1.8G, change block size in param.h. --- kernel/mips64/KERNEL.LOONGSON3A | 6 + kernel/mips64/zgemm_kernel_loongson3a.S | 923 ++++++++++++++++++++++++ param.h | 13 +- 3 files changed, 936 insertions(+), 6 deletions(-) create mode 100644 kernel/mips64/zgemm_kernel_loongson3a.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index e72ac142e..94c8b1b9a 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -13,6 +13,12 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_loongson3a.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S new file mode 100644 index 000000000..0b0d73137 --- /dev/null +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -0,0 +1,923 @@ +#define ASSEMBLER +#include "common.h" + + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || 
defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + + dsra J, N, 1 # J=N/2 + ST ALPHA_I, 136($sp) + + dsll PREB, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + blez J, .L20 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + .align 5 +.L10: + daddiu J, J, -1 + move CO1, C # Fix pointer Cx + + daddu CO2, C, LDC + move AO, A # Reset AO + + dsra I, M, 1 # I=M/2 + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double + daddu PREB, PREB, B # PREA=A+panel size + + FETCH $0, 0 * SIZE(CO1) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO2) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c41, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c42, c11 + MOV c43, c11 + + FETCH $0, 4 * SIZE(CO2) + blez L, .L15 + MOV c44, c11 + + .align 5 + +.L12: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F13, F12, 2) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F11, F10, 3) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + gsLQC1(R13, F16, F15, 3) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 4) # Unroll K=2 + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + gsLQC1(R13, F5, F4, 4) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + gsLQC1(R12, F3, F2, 5) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + gsLQC1(R13, F7, F6, 5) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + daddiu L, L, -1 + + gsLQC1(R13, F13, F12, 6) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F11, F10, 7) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + gsLQC1(R13, F16, F15, 7) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, 
c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddu PREA, PREA, 16 * SIZE + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREB, PREB, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: + andi L, K, 3 + LD ALPHA_R, 128($sp) + NOP + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu L, L, -1 + daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + bgtz L, .L16 + NOP + +.L18: + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + ST a1, 0 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST a2, 1 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b1, 2 * SIZE(CO1) + + ST b2, 3 * SIZE(CO1) + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + blez I, .L19 + daddu C, C, LDC # Change C to next panel + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + MOV c31, c11 + MOV c32, c11 + MOV c33, c11 + MOV c34, c11 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + blez L, .L35 + NOP + + .align 3 + +.L32: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F13, 
F12, 2) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F5, F4, 4) + gsLQC1(R13, F7, F6, 5) + + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + + daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F13, F12, 6) + gsLQC1(R13, F16, F15, 7) + + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + MADD1 c31, c31, a7, b7 # A1xB2 + MADD3 c33, c33, a7, b8 + MADD2 c32, c32, a8, b7 + MADD4 c34, c34, a8, b8 + + bgtz L, .L32 + NOP + + .align 3 + +.L35: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L38 + NOP + .align 3 + +.L36: + daddiu L, L, -1 + daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + bgtz L, .L36 + NOP + +.L38: + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + LD a3, 0 * SIZE(CO2) + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + + .align 3 + +.L19: + bgtz J, .L10 + move B, BO + + .align 3 + +.L20: + andi J, N, 1 + blez J, .L999 + NOP + + move CO1, C + move AO, A # Reset AO + + dsra I, M, 1 # I=M/2 + blez I, .L29 + NOP + +.L21: + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + MOV c21, c11 + MOV c22, c11 + MOV c23, c11 + MOV c24, c11 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + blez L, .L25 + NOP + + .align 3 + +.L22: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 
4) # Unroll K=2 + gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F13, F12, 2) + + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + + + daddiu L, L, -1 + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R12, F11, F10, 7) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + MADD2 c22, c22, a8, b7 + MADD4 c24, c24, a8, b8 + + bgtz L, .L22 + NOP + + .align 3 + +.L25: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L28 + NOP + .align 3 + +.L26: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L26 + NOP + +.L28: + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + LD b1, 2 * SIZE(CO1) + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + blez L, .L45 + NOP + + .align 3 + +.L42: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) + + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + + daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, 
b8 # bxd + + bgtz L, .L42 + NOP + + .align 3 + +.L45: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L48 + NOP + .align 3 + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L46 + NOP + +.L48: + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + daddiu CO1,CO1, 2 * SIZE + + + + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/param.h b/param.h index 603caab46..b7f0d662a 100644 --- a/param.h +++ b/param.h @@ -1488,23 +1488,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 32 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 108 -#define ZGEMM_DEFAULT_P 112 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 -#define ZGEMM_DEFAULT_Q 72 +#define ZGEMM_DEFAULT_Q 60 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 #define CGEMM_DEFAULT_R 2000 -#define ZGEMM_DEFAULT_R 2000 +#define ZGEMM_DEFAULT_R 1000 #define SYMV_P 16 #endif From 14f81da375232998e8c1f149ab61db43bfb300af Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 23 Jun 2011 10:46:58 +0000 Subject: [PATCH 02/30] Change prefetch length of A and B, the performance is 2.1G now. 
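The param.h hunk above switches the complex-double kernel to a 2x2 unroll and sets ZGEMM_DEFAULT_P/Q/R to 32/60/1000. In the generic level-3 driver these act as cache-blocking sizes: roughly, P tiles the M dimension, Q tiles the K dimension (the depth of the packed panels) and R bounds the N panel, with the new zgemm_kernel_loongson3a.S micro-kernel consuming one 2x2 tile at a time. A simplified plain-C picture of that loop nest (hypothetical illustration only: no packing and no 2x2 unrolling, the function and names are not from the sources):

    #include <complex.h>

    /* Blocking constants mirroring ZGEMM_DEFAULT_P/Q/R from param.h. */
    enum { BLOCK_P = 32, BLOCK_Q = 60, BLOCK_R = 1000 };

    static int imin(int a, int b) { return a < b ? a : b; }

    /* C(m x n) += alpha * A(m x k) * B(k x n), column-major, no transposes.
     * The three outer loops are the P/Q/R cache blocking; the innermost body
     * stands in for the packed 2x2 assembly micro-kernel. */
    void zgemm_blocked(int m, int n, int k, double complex alpha,
                       const double complex *a, int lda,
                       const double complex *b, int ldb,
                       double complex *c, int ldc)
    {
        for (int jj = 0; jj < n; jj += BLOCK_R)
            for (int ll = 0; ll < k; ll += BLOCK_Q)
                for (int ii = 0; ii < m; ii += BLOCK_P)
                    for (int j = jj; j < imin(jj + BLOCK_R, n); j++)
                        for (int l = ll; l < imin(ll + BLOCK_Q, k); l++)
                            for (int i = ii; i < imin(ii + BLOCK_P, m); i++)
                                c[i + j * ldc] += alpha * a[i + l * lda] * b[l + j * ldb];
    }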
--- kernel/mips64/zgemm_kernel_loongson3a.S | 373 +++++++++++++----------- 1 file changed, 207 insertions(+), 166 deletions(-) diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 0b0d73137..49603675a 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -6,6 +6,7 @@ #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + #define STACKSIZE 160 #define M $4 #define N $5 @@ -109,12 +110,18 @@ #define ALPHA_R $f15 #define ALPHA_I $f16 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +####if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB -#endif +###endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD @@ -166,25 +173,28 @@ sdc1 $f23,112($sp) #endif - dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i - dsra J, N, 1 # J=N/2 + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 ST ALPHA_I, 136($sp) - dsll PREB, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 - blez J, .L20 - dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 .align 5 .L10: daddiu J, J, -1 - move CO1, C # Fix pointer Cx - - daddu CO2, C, LDC - move AO, A # Reset AO - dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size @@ -192,41 +202,32 @@ dsra L, K, 2 # Unroll K 4 times move BO, B - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 - - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c23, c11 MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double - daddu PREB, PREB, B # PREA=A+panel size - - FETCH $0, 0 * SIZE(CO1) MOV c31, c11 MOV c32, c11 - - FETCH $0, 0 * SIZE(CO2) + MOV c33, c11 MOV c34, c11 - - FETCH $0, 0 * SIZE(PREB) + MOV c41, c11 - - FETCH $0, 4 * SIZE(CO1) MOV c42, c11 + MOV c43, c11 - - FETCH $0, 4 * SIZE(CO2) blez L, .L15 MOV c44, c11 @@ -234,26 +235,26 @@ .L12: gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R13, F13, F12, 2) + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - gsLQC1(R13, F16, F15, 3) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -262,27 +263,27 @@ MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 - gsLQC1(R12, F1, F0, 4) # Unroll K=2 + gsLQC1(R12, F1, F0, 4) # unroll k=2 + gsLQC1(R13, F5, F4, 4) MADD1 c11, c11, a5, b5 # axc A1xB1 
MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 4) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F7, F6, 5) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 5) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -292,61 +293,61 @@ MADD4 c44, c44, a8, b8 gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R13, F13, F12, 6) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu L, L, -1 - gsLQC1(R13, F13, F12, 6) + gsLQC1(R13, F16, F15, 7) + gsLQC1(R12, F11, F10, 7) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx - gsLQC1(R13, F16, F15, 7) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - daddu PREA, PREA, 16 * SIZE MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE daddu PREB, PREB, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -362,46 +363,52 @@ .L15: andi L, K, 3 LD ALPHA_R, 128($sp) - NOP blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx - + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 + FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - bgtz L, .L16 NOP .L18: + ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -426,170 +433,196 @@ MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 - MADD a3, a3, ALPHA_R, c31 - MADD a4, a4, ALPHA_R, c32 - MADD b3, b3, ALPHA_R, c41 - MADD b4, b4, ALPHA_R, c42 - NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, 
b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 - ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 - ST b1, 2 * SIZE(CO1) - ST b2, 3 * SIZE(CO1) + ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 12 * SIZE(CO2) + FETCH $0, 12 * SIZE(CO1) + FETCH $0, 16 * SIZE(CO2) + FETCH $0, 16 * SIZE(CO1) + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE + .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c31, c11 MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + MOV c33, c11 + blez L, .L35 MOV c34, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - - blez L, .L35 - NOP - - .align 3 + .align 5 .L32: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F13, F12, 2) - gsLQC1(R13, F16, F15, 3) - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + NOP MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + NOP gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R13, F5, F4, 4) - gsLQC1(R13, F7, F6, 5) - MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd + + gsLQC1(R13, F7, F6, 5) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd + NOP MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 - daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) gsLQC1(R13, F13, F12, 6) - gsLQC1(R13, F16, F15, 7) - MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd + + gsLQC1(R13, F16, F15, 7) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd - - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 + + FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd + daddiu PREB, PREB, 16 * SIZE MADD1 c31, c31, a7, b7 # A1xB2 MADD3 c33, c33, a7, b8 + + FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 + bgtz L, .L32 MADD4 c34, c34, a8, b8 - bgtz L, .L32 - NOP - - .align 3 .L35: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) - blez L, .L38 NOP - .align 3 + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 .L36: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + NOP bgtz L, .L36 - NOP + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: ADD c11, c14, c11 - ADD c12, c13, c12 - - ADD c31, c34, c31 - ADD c32, c33, c32 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 @@ -613,43 +646,48 @@ daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE - .align 3 + .align 5 .L19: bgtz J, .L10 move B, BO - .align 3 + .align 5 .L20: andi J, N, 1 blez J, .L999 - NOP - - move CO1, C - move AO, A # Reset AO + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 + move CO1, C + + move AO, A # Reset AO blez I, .L29 - NOP + daddu PREA, PREA, A .L21: dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP @@ -658,110 +696,116 @@ .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 gsLQC1(R12, F1, F0, 4) # Unroll K=2 - gsLQC1(R12, F3, F2, 5) - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd + gsLQC1(R12, F3, F2, 5) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 - - daddiu L, L, -1 - gsLQC1(R12, F9, F8, 6) # Unroll K=3 - gsLQC1(R12, F11, F10, 7) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F9, F8, 6) # Unroll K=3 MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd - daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx - + gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd + daddiu PREA, PREA, 16 * SIZE + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) 
MADD2 c22, c22, a8, b7 + bgtz L, .L22 MADD4 c24, c24, a8, b8 - bgtz L, .L22 - NOP - - .align 3 .L25: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) + blez L, .L28 - NOP + LD ALPHA_I, 136($sp) .align 3 .L26: - daddiu L, L, -1 - daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 bgtz L, .L26 - NOP + FETCH $0, 0 * SIZE(PREA) .L28: ADD c11, c14, c11 - ADD c12, c13, c12 - ADD c21, c24, c21 - ADD c22, c23, c22 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 @@ -792,15 +836,16 @@ dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 - - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP @@ -808,53 +853,49 @@ .L42: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd - daddiu L, L, -1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F11, F10, 3) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - MADD2 c12, c12, a6, b5 # bxc - MADD4 c14, c14, a6, b6 # bxd - - daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a8, b7 # bxc + bgtz L, .L42 MADD4 c14, c14, a8, b8 # bxd - bgtz L, .L42 - NOP - .align 3 + .align 5 .L45: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) blez L, .L48 - NOP - .align 3 + LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 @@ -892,7 +933,7 @@ - .align 3 + .align 5 .L999: LDARG $16, 0($sp) From e72113f06a33c7e8dfc799fa1edf3f85f5dd6fc1 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 23 Jun 2011 21:11:00 +0000 Subject: [PATCH 03/30] Add ztrmm and ztrsm part on loongson3a. The average performance is 2.2G. 
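The MADD1/MADD2/MADD3/MADD4 comment block introduced above (MADD1 a*c, MADD2 b*c, MADD3 a*d, MADD4 b*d) is the whole trick behind supporting every conjugation variant with one kernel: each output element keeps four partial sums in c11/c12/c13/c14, and the NN/NR/RN/RR macro groups only change which of the four products is added or subtracted. The final ADDs (for example at .L18) then fold the partials into the real and imaginary parts. A small self-contained C model of one such complex multiply-accumulate (an illustrative assumption, not code from the sources):

    #include <stdio.h>

    typedef struct { double r, i; } zdouble;

    /* conj_a / conj_b select the same sign pattern as the NN / NR / RN / RR
     * macro groups: MADD keeps a product, NMSUB subtracts it. */
    static zdouble cmadd(zdouble acc, zdouble a, zdouble b, int conj_a, int conj_b)
    {
        double sa = conj_a ? -1.0 : 1.0;          /* sign applied to a.i */
        double sb = conj_b ? -1.0 : 1.0;          /* sign applied to b.i */

        double c11 =            a.r * b.r;        /* MADD1: a*c, always added */
        double c12 = sa *       a.i * b.r;        /* MADD2: b*c               */
        double c13 = sb *       a.r * b.i;        /* MADD3: a*d               */
        double c14 = -sa * sb * a.i * b.i;        /* MADD4: b*d               */

        acc.r += c11 + c14;                       /* ADD c11, c14, c11 */
        acc.i += c12 + c13;                       /* ADD c12, c13, c12 */
        return acc;
    }

    int main(void)
    {
        zdouble acc = { 0.0, 0.0 };
        zdouble a = { 1.0, 2.0 }, b = { 3.0, 4.0 };
        acc = cmadd(acc, a, b, 0, 0);             /* NN case: (1+2i)*(3+4i) = -5+10i */
        printf("%g %+gi\n", acc.r, acc.i);
        return 0;
    }

The same complex-multiply sign pattern is applied once more at the end of each tile, where ALPHA_R/ALPHA_I scale the accumulated result before it is added to C.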
--- kernel/mips64/KERNEL | 11 + kernel/mips64/KERNEL.LOONGSON3A | 9 + kernel/mips64/zgemm_kernel_loongson3a.S | 441 ++++++++++++++++++++++-- param.h | 2 +- 4 files changed, 438 insertions(+), 25 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index ebb447b11..a14b1cb38 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -128,10 +128,21 @@ CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S +ifndef ZTRSMKERNEL_LN ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 94c8b1b9a..706f48128 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -28,3 +28,12 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 49603675a..13022f698 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -1,12 +1,10 @@ #define ASSEMBLER #include "common.h" - #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) - #define STACKSIZE 160 #define M $4 #define N $5 @@ -116,12 +114,12 @@ ## MADD3 a*d ## MADD4 d*b ################################## -####if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB -###endif +#endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD @@ -175,6 +173,9 @@ dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE blez J, .L20 @@ -183,6 +184,10 @@ .align 5 .L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + daddiu J, J, -1 dsra I, M, 1 # I=M/2 @@ -193,12 +198,66 @@ daddu CO2, C, LDC move AO, A # Reset AO - daddu PREB, PREB, B # PREA=A+panel size - blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2 + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, c11 # Clear results regs + MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + + MOV c13, c11 + MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MOV c21, c11 + MOV c22, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + + MOV c23, c11 + MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + 
FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times move BO, B @@ -218,18 +277,25 @@ MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 + FETCH $0, 4 * SIZE(CO1) MOV c43, c11 + + daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 +#endif .align 5 @@ -361,8 +427,13 @@ .align 5 .L15: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L18 LD ALPHA_I, 136($sp) @@ -408,7 +479,7 @@ NOP .L18: - +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -458,20 +529,75 @@ ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) - FETCH $0, 4 * SIZE(CO2) - FETCH $0, 4 * SIZE(CO1) - FETCH $0, 8 * SIZE(CO2) - FETCH $0, 8 * SIZE(CO1) - FETCH $0, 12 * SIZE(CO2) - FETCH $0, 12 * SIZE(CO1) - FETCH $0, 16 * SIZE(CO2) - FETCH $0, 16 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE - + .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel @@ -480,22 +606,69 @@ blez I, .L19 daddu C, C, LDC # Change C to next panel - dsra L, K, 2 # Unroll K 4 times +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + move BO, B + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MTC $0, c11 # Clear results regs MOV c12, c11 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c13, c11 MOV c14, c11 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + 
daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c13, c11 + MOV c14, c11 + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) @@ -504,6 +677,7 @@ MOV c33, c11 blez L, .L35 MOV c34, c11 +#endif .align 5 @@ -582,15 +756,18 @@ .L35: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) - NOP +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: - daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd @@ -615,6 +792,7 @@ gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -645,10 +823,60 @@ daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 5 .L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + bgtz J, .L10 move B, BO @@ -662,11 +890,56 @@ dsra I, M, 1 # I=M/2 move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, A .L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else dsra L, K, 2 # Unroll K 4 times move BO, B @@ -691,8 +964,9 @@ blez L, .L25 NOP +#endif - .align 3 + .align 5 .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 @@ -766,15 +1040,18 @@ .L25: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) - +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: - daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd @@ -799,6 +1076,7 @@ FETCH $0, 0 * SIZE(PREA) .L28: +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -824,6 +1102,48 @@ ST 
b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -833,6 +1153,39 @@ blez I, .L999 NOP +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else dsra L, K, 2 # Unroll K 4 times move BO, B @@ -848,6 +1201,7 @@ FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP +#endif .align 3 @@ -892,8 +1246,13 @@ .align 5 .L45: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L48 LD ALPHA_I, 136($sp) @@ -914,6 +1273,7 @@ NOP .L48: +#ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 @@ -929,7 +1289,40 @@ ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + daddiu CO1,CO1, 2 * SIZE +#endif diff --git a/param.h b/param.h index b7f0d662a..cab3e68dd 100644 --- a/param.h +++ b/param.h @@ -1500,7 +1500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 -#define ZGEMM_DEFAULT_Q 60 +#define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 From 708d2b625504a67c5385efbff8010ea5c7a2b98e Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 24 Jun 2011 09:27:41 +0000 Subject: [PATCH 04/30] Fix compute error in ztrmm. 
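In the TRMM path added above, OFFSET/KK/TEMP exist because a triangular factor only contributes part of the K dimension: for each tile, TEMP becomes the shortened inner-product length and AO/BO are advanced past the part of the packed panel that would only multiply against zeros. As a plain-C reference for one of the cases the kernel ultimately has to reproduce, B := alpha*A*B with A lower triangular, side left, no transpose (a hypothetical reference routine, not the packed/blocked implementation):

    #include <complex.h>

    /* Column-major, non-unit diagonal.  The inner loop bound "l <= i" is the
     * triangular cut-off that KK/TEMP implement tile by tile in the assembly. */
    void ztrmm_ref_llnn(int m, int n, double complex alpha,
                        const double complex *a, int lda,
                        double complex *b, int ldb)
    {
        for (int j = 0; j < n; j++)
            for (int i = m - 1; i >= 0; i--) {    /* bottom-up keeps the rows of B still needed intact */
                double complex t = 0.0;
                for (int l = 0; l <= i; l++)      /* only the lower-triangular columns of row i */
                    t += a[i + l * lda] * b[l + j * ldb];
                b[i + j * ldb] = alpha * t;
            }
    }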
--- kernel/mips64/zgemm_kernel_loongson3a.S | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 13022f698..4cc396614 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -618,28 +618,26 @@ #endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - move BO, B - - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MTC $0, c11 # Clear results regs MOV c12, c11 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 - FETCH $0, 0 * SIZE(PREB) + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c31, c11 MOV c32, c11 + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) - MOV c33, c11 - MOV c34, c11 - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) From c8360e3ae5793e9285c74f7e78d33c0b10653a91 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 18 Jul 2011 17:03:38 +0000 Subject: [PATCH 05/30] Complete all the plura single precision functions of level3 on Loongson3a, the performance is 2.3GFlops. --- kernel/mips64/KERNEL | 11 + kernel/mips64/KERNEL.LOONGSON3A | 17 +- kernel/mips64/cgemm_kernel_loongson3a_2x2.S | 1468 +++++++++++++++++ ...gson3a.S => dgemm_kernel_loongson3a_4x4.S} | 0 ...gson3a.S => sgemm_kernel_loongson3a_4x4.S} | 0 ...gson3a.S => zgemm_kernel_loongson3a_2x2.S} | 2 +- param.h | 10 +- 7 files changed, 1499 insertions(+), 9 deletions(-) create mode 100644 kernel/mips64/cgemm_kernel_loongson3a_2x2.S rename kernel/mips64/{gemm_kernel_loongson3a.S => dgemm_kernel_loongson3a_4x4.S} (100%) rename kernel/mips64/{sgemm_kernel_loongson3a.S => sgemm_kernel_loongson3a_4x4.S} (100%) rename kernel/mips64/{zgemm_kernel_loongson3a.S => zgemm_kernel_loongson3a_2x2.S} (100%) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index a14b1cb38..6afb2cf13 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -123,10 +123,21 @@ ifndef DTRSMKERNEL_RT DTRSMKERNEL_RT = trsm_kernel_RT.S endif +ifndef CTRSMKERNEL_LN CTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_LT CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif ifndef ZTRSMKERNEL_LN ZTRSMKERNEL_LN = ztrsm_kernel_LT.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 706f48128..ebab8e6ea 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,19 +1,25 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S -SGEMMKERNEL = sgemm_kernel_loongson3a.S +SGEMMKERNEL = sgemm_kernel_loongson3a_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = gemm_kernel_loongson3a.S +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -ZGEMMKERNEL = zgemm_kernel_loongson3a.S +CGEMMKERNEL = cgemm_kernel_loongson3a_2x2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o @@ -29,6 +35,11 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 
72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + 
+ LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, 
c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # 
bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + 
dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + 
MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * 
SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S similarity index 100% rename from kernel/mips64/gemm_kernel_loongson3a.S rename to kernel/mips64/dgemm_kernel_loongson3a_4x4.S diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S similarity index 100% rename from kernel/mips64/sgemm_kernel_loongson3a.S 
rename to kernel/mips64/sgemm_kernel_loongson3a_4x4.S diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S similarity index 100% rename from kernel/mips64/zgemm_kernel_loongson3a.S rename to kernel/mips64/zgemm_kernel_loongson3a_2x2.S index 4cc396614..a8faad2f6 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S @@ -1065,8 +1065,8 @@ daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 diff --git a/param.h b/param.h index cab3e68dd..fd399a96f 100644 --- a/param.h +++ b/param.h @@ -1486,25 +1486,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 32 -#define CGEMM_DEFAULT_P 108 +#define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 -#define CGEMM_DEFAULT_Q 144 +#define CGEMM_DEFAULT_Q 100 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 -#define CGEMM_DEFAULT_R 2000 +#define CGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 #define SYMV_P 16 From 2e8cdd15423a98d5f8b8efd5e4dd66ff9364d343 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 30 Aug 2011 20:54:19 +0000 Subject: [PATCH 06/30] Using ps instruction. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 632 ++++++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100644 kernel/mips64/sgemm_kernel_8x4_ps.S diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S new file mode 100644 index 000000000..075957038 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -0,0 +1,632 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define FETCH ld +#define STACKSIZE 192 + +##### Parameter registers #### + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 $f23 +#define C13 $f24 +#define C14 $f25 +#define C23 $f26 +#define C24 $f27 +#define C33 $f28 +#define C34 $f29 +#define C43 $f30 +#define C44 $f31 + +#define I $2 +#define J $3 +#define L $7 + +#### Alpha register #### +#define ALPHA $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 
23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + +#define R12 12 +#define R13 13 + +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + + #.text +#.align 2 +# .globl REALNAME +# .set nomips16 +# .ent REALNAME +# .type REALNAME, @function +#REALNAME: +# .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 +# .mask 0x40000000,-8 +# .fmask 0x00000000,0 +# .set noreorder +# .set nomacro + + + PROLOGUE + + daddiu $sp,$sp,-STACKSIZE + sd $fp,184($sp) + move $fp,$sp + + sd $16, 0($fp) + sd $17, 8($fp) + sd $18, 16($fp) + sd $19, 24($fp) + sd $20, 32($fp) + sd $21, 40($fp) + sd $22, 48($fp) + + ST $f24, 56($fp) + ST $f25, 64($fp) + ST $f26, 72($fp) + ST $f27, 80($fp) + ST $f28, 88($fp) + +#if defined(TRMMKERNEL) + sd $23, 96($fp) + sd $24, 104($fp) + sd $25, 112($fp) +#endif + +#ifndef __64BIT__ + ST $f20,120($fp) + ST $f21,128($fp) + ST $f22,136($fp) + ST $f23,144($fp) +#endif + + .align 4 +.L4: + dsra J, N, 2 # NR=4 + dsll LDC, LDC, BASE_SHIFT# LDC*SIZE + + ST ALPHA, 152($fp) # Store alpha + blez J, .L2 + NOP + + +.L48: + dsra I, M, 3 # MR=8 + dsll PREA, K, BASE_SHIFT + + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + daddu CO3, CO2, LDC + + daddu CO4, CO3, LDC + daddu PREA, A, PREA + + blez I, .L44 + daddu C, CO4, LDC + + .align 4 +.L488: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=8 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + FETCH $0, 4 * SIZE(CO3) + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L484 + FETCH $0, 0 * SIZE(CO4) + +.L4880: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + FETCH $0, 0 * SIZE(PREA) + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + FETCH $0, 0 * SIZE(PREB) + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + FETCH $0, 4 * SIZE(PREA) + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + FETCH $0, 4 * SIZE(PREB) + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + FETCH $0, 8 * SIZE(PREA) + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + FETCH $0, 12 * SIZE(PREA) + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, 
A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + FETCH $0, 16 * SIZE(PREA) + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + FETCH $0, 20 * SIZE(PREA) + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + FETCH $0, 8 * SIZE(PREB) + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + FETCH $0, 12 * SIZE(PREB) + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + FETCH $0, 24 * SIZE(PREA) + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + FETCH $0, 28 * SIZE(PREA) + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + daddiu PREB, PREB, 16 * SIZE + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + bgtz L, .L4880 + MADPS C44, C44, A8, B8 + + .align 4 +.L484: + andi L, K, 4 + blez L, .L482 + NOP + + .align 4 +.L482: + andi L, K, 2 + blez L, .L481 + NOP + + .align 4 +.L481: + andi L, K, 1 + blez L, .L480 + NOP + + .align 4 +.L480: # Write Back + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + CVTU A3, C23 # A3=C23.upper=c14 + LD B1, 1 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B2, 1 * SIZE(CO2) + + CVTU A5, C33 # A5=C33.upper=c16 + LD B3, 3 * SIZE(CO1) + + CVTU A6, C31 # A6=C31.upper=c26 + LD B4, 3 * SIZE(CO2) + + CVTU A7, C43 # A7=C43.upper=c18 + LD B5, 5 * SIZE(CO1) + + CVTU A8, C41 # A8=C41.upper=c28 + LD B6, 5 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B7, 7 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B1, 7 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B2, 0 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B3, 0 * SIZE(CO2) + + MADD A5, B5, A5, ALPHA # c16 + LD B4, 2 * SIZE(CO1) + + MADD A6, B6, A6, ALPHA # c26 + LD B5, 2 * SIZE(CO2) + + MADD A7, B7, A7, ALPHA # c18 + LD B6, 4 * SIZE(CO1) + ST A1, 1 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 + LD B7, 4 * SIZE(CO2) + ST A2, 1 * SIZE(CO2) + + MADD C11, B2, C11, ALPHA # c12 + LD A1, 6 * SIZE(CO1) + ST A3, 3 * SIZE(CO1) + + MADD C13, B3, C13, ALPHA # c22 + LD A2, 6 * SIZE(CO2) + ST A4, 3 * SIZE(CO2) + + MADD C21, B4, C21, ALPHA # c14 + ST A5, 5 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 + ST A6, 5 * SIZE(CO2) + + MADD C31, B6, C31, ALPHA # c16 + ST A7, 7 * SIZE(CO1) + + MADD C33, B7, C33, ALPHA # c26 + ST A8, 7 * SIZE(CO2) + + MADD C41, A1, C41, ALPHA # c18 + ST C11, 0 * SIZE(CO1) + + MADD C43, A2, C43, ALPHA # c28 + ST C13, 0 * SIZE(CO2) + + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + ST C31, 4 * SIZE(CO1) + ST C33, 4 * SIZE(CO2) + ST C41, 6 * SIZE(CO1) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C43, 6 * SIZE(CO2) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B1, 1 * SIZE(CO3) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B2, 1 * SIZE(CO4) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B3, 3 * SIZE(CO3) + + CVTU A5, C34 # B5=C32.upper=c46 + LD 
B4, 3 * SIZE(CO4) + + CVTU A6, C32 # B6=C24.upper=c36 + LD B5, 5 * SIZE(CO3) + + CVTU A7, C44 # B7=C42.upper=c48 + LD B6, 5 * SIZE(CO4) + + CVTU A8, C42 # A1=C44.upper=c38 + LD B7, 7 * SIZE(CO3) + + MADD A1, B1, A1, ALPHA # c31 + LD C11, 7 * SIZE(CO4) + + MADD A2, B2, A2, ALPHA + LD C13, 0 * SIZE(CO3) + + MADD A3, B3, A3, ALPHA + LD C21, 0 * SIZE(CO4) + + MADD A4, B4, A4, ALPHA + LD C23, 2 * SIZE(CO3) + + MADD A5, B5, A5, ALPHA + LD C31, 2 * SIZE(CO4) + + MADD A6, B6, A6, ALPHA + LD C33, 4 * SIZE(CO3) + + MADD A7, B7, A7, ALPHA + LD C41, 4 * SIZE(CO4) + ST A1, 1 * SIZE(CO3) + + MADD A8, C11, A8, ALPHA + LD C43, 6 * SIZE(CO3) + ST A2, 1 * SIZE(CO4) + + MADD C12, C13, C12, ALPHA + LD B1, 6 * SIZE(CO4) + ST A3, 3 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA + ST A4, 3 * SIZE(CO4) + + MADD C22, C23, C22, ALPHA + ST A5, 5 * SIZE(CO3) + + MADD C24, C31, C24, ALPHA + ST A6, 5 * SIZE(CO4) + + MADD C32, C33, C32, ALPHA + ST A7, 7 * SIZE(CO3) + + MADD C34, C41, C34, ALPHA + ST A8, 7 * SIZE(CO4) + + MADD C42, C43, C42, ALPHA + ST C12, 0 * SIZE(CO3) + + MADD C44, B1, C44, ALPHA + ST C14, 0 * SIZE(CO4) + + ST C22, 2 * SIZE(CO3) + daddiu CO1, CO1, 8 * SIZE + + ST C24, 2 * SIZE(CO4) + daddiu CO2, CO2, 8 * SIZE + + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO3, CO3, 8 * SIZE + bgtz I, .L488 + daddiu CO4, CO4, 8 * SIZE + +.L44: + +.L40: + daddiu J, J, -1 + move B, BO + + bgtz J, .L48 + NOP + + .align 4 +.L2: # Nr=2 + andi J, N, 2 + blez J, .L1 + NOP + + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + + + +.L999: + ld $16, 0($fp) + ld $17, 8($fp) + ld $18, 16($fp) + ld $19, 24($fp) + ld $20, 32($fp) + ld $21, 40($fp) + ld $22, 48($fp) + + LD $f24, 56($fp) + LD $f25, 64($fp) + LD $f26, 72($fp) + LD $f27, 80($fp) + LD $f28, 88($fp) + +#if defined(TRMMKERNEL) + ld $23, 96($fp) + ld $24, 104($fp) + ld $25, 112($fp) +#endif + +#ifndef __64BIT__ + LD $f20,120($fp) + LD $f21,128($fp) + LD $f22,136($fp) + LD $f23,144($fp) +#endif + + move $sp,$fp + ld $fp,184($sp) + daddiu $sp,$sp,STACKSIZE + j $31 + nop + + EPILOGUE +# .set macro +# .set reorder +# .end REALNAME +# .size REALNAME, .-REALNAME +#.ident "GCC: (Debian 4.4.6-6) 4.4.6" From cb0214787b361a6e1f8ac0d1a423d4a95b474832 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 30 Aug 2011 20:57:00 +0000 Subject: [PATCH 07/30] Modify compile options. 
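This patch wires the 8x4 paired-single sgemm kernel into the build: KERNEL.LOONGSON3A switches SGEMMKERNEL to sgemm_kernel_8x4_ps.S and adds the 8-wide A-copy routines, param.h raises SGEMM_DEFAULT_UNROLL_M to 8 and retunes the SGEMM/DGEMM block sizes, and common_mips64.h grows BUFFER_SIZE (8<<20 to 32<<20) and defines the paired-single mnemonics (PLU, PLL, PUU, PUL, MADPS, CVTU, CVTL) used by that kernel.

As a rough model only (the type and helper names below are illustrative and not part of this tree), MADPS (madd.ps) is assumed to behave like a multiply-add applied to both 32-bit lanes of a 64-bit floating-point register:

    /* Sketch of the per-lane behaviour assumed for MADPS: both
     * single-precision halves of a paired register are multiplied
     * and accumulated by one instruction.  Hypothetical helpers,
     * written in C only for clarity.                              */
    typedef struct { float lo, hi; } ps_t;

    static inline ps_t madps(ps_t acc, ps_t a, ps_t b)
    {
        acc.lo += a.lo * b.lo;   /* lower lane */
        acc.hi += a.hi * b.hi;   /* upper lane */
        return acc;
    }

This two-results-per-instruction model is also why the 8x4 result block fits in the sixteen paired accumulators C11..C44: each holds two single-precision sums.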
--- common_mips64.h | 9 ++++++++- kernel/mips64/KERNEL.LOONGSON3A | 10 +++++++--- param.h | 21 +++++++++++++++++---- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/common_mips64.h b/common_mips64.h index acea79011..2aa325bfa 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -170,6 +170,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s +#define PLU plu.ps +#define PLL pll.ps +#define PUU puu.ps +#define PUL pul.ps +#define MADPS madd.ps +#define CVTU cvt.s.pu +#define CVTL cvt.s.pl #endif #if defined(__64BIT__) && defined(USE64BITINT) @@ -218,7 +225,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 8 << 20) +#define BUFFER_SIZE ( 32 << 20) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index ebab8e6ea..4a195f265 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,9 +1,13 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S -SGEMMKERNEL = sgemm_kernel_loongson3a_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/param.h b/param.h index fd399a96f..2c3021710 100644 --- a/param.h +++ b/param.h @@ -1480,7 +1480,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 @@ -1497,16 +1497,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 -#define SGEMM_DEFAULT_Q 116 -#define DGEMM_DEFAULT_Q 116 +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 112 #define CGEMM_DEFAULT_Q 100 #define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 1000 +#define SGEMM_DEFAULT_R 1024 +//#define DGEMM_DEFAULT_R 300 +//#define DGEMM_DEFAULT_R 200 +//#define DGEMM_DEFAULT_R 400 +//#define DGEMM_DEFAULT_R 192 #define DGEMM_DEFAULT_R 1000 +//#define DGEMM_DEFAULT_R 160 +//#define DGEMM_DEFAULT_R 270 #define CGEMM_DEFAULT_R 1000 +//#define ZGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 +#define GEMM_OFFSET_A1 (DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*SIZE) +//#define GEMM_OFFSET_B1 0x10 +#define GEMM_OFFSET_B1 (DGEMM_DEFAULT_Q*DGEMM_DEFAULT_R*SIZE) +#define GEMM_OFFSET 0x100000 +#define GEMM_OFFSET1 0x40000 + #define SYMV_P 16 #endif From 09f49fa891a7351abdcf6db95a45c6d6780b69e0 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 31 Aug 2011 21:24:03 +0000 Subject: [PATCH 08/30] Using PS instructions to improve the performance of sgemm and it is 4.2Gflops now. 
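The kernel body filled in here unrolls the K loop much more deeply (the main block now takes L = K >> 6, i.e. K unrolled by 64) and interleaves packed loads, prefetches and paired-single multiply-adds; PLU re-pairs the halves of the loaded B values to form the operands for the second set of accumulators. Functionally the 8x4 micro-kernel accumulates C(8x4) += A(8xK) * B(Kx4) over packed panels. A scalar C reference of that update is sketched below for orientation only; the packed layouts assumed here match the gemm_ncopy_8 / gemm_ncopy_4 copy routines, and alpha scaling, TRMM handling and edge cases are omitted, so this is not the real entry point.

    /* Scalar sketch of the update performed by one 8x4 micro-kernel
     * call: C(8x4) += A(8xK) * B(Kx4) on packed panels.  Assumes A
     * is packed MR=8 floats per k step and B NR=4 floats per k step. */
    static void sgemm_8x4_ref(long K, const float *A, const float *B,
                              float *C, long ldc)
    {
        for (long k = 0; k < K; k++)
            for (int j = 0; j < 4; j++)          /* NR = 4 columns */
                for (int i = 0; i < 8; i++)      /* MR = 8 rows    */
                    C[i + j * ldc] += A[8 * k + i] * B[4 * k + j];
    }

The P/Q block sizes chosen in the previous patch are presumably meant to keep the packed A panel cache resident while B streams through this inner loop.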
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 6041 ++++++++++++++++++++++++++- 1 file changed, 5951 insertions(+), 90 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 075957038..02a0833dd 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -2,13 +2,12 @@ #define ASSEMBLER #include "common.h" +#define FETCH ld +#define STACKSIZE 192 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define FETCH ld -#define STACKSIZE 192 - ##### Parameter registers #### #define M $4 @@ -115,13 +114,13 @@ #define R16 16 #define R17 17 - #.text -#.align 2 -# .globl REALNAME +# .text +# .align 2 +## .globl gemm # .set nomips16 -# .ent REALNAME -# .type REALNAME, @function -#REALNAME: +# .ent gemm +# .type gemm, @function +#gemm: # .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 @@ -166,11 +165,8 @@ .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE - - ST ALPHA, 152($fp) # Store alpha blez J, .L2 - NOP - + ST ALPHA, 152($fp) .L48: dsra I, M, 3 # MR=8 @@ -189,9 +185,9 @@ daddu C, CO4, LDC .align 4 -.L488: +.L481: move BO, B # Reset B - dsra L, K, 2 # UnRoll K=8 + dsra L, K, 6 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 @@ -233,10 +229,10 @@ FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 - blez L, .L484 - FETCH $0, 0 * SIZE(CO4) + blez L, .L482 + FETCH $0, 4 * SIZE(CO4) -.L4880: +.L4810: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -252,21 +248,21 @@ MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 - FETCH $0, 0 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 0 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) - FETCH $0, 4 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -285,21 +281,21 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 4 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - FETCH $0, 8 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 12 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 @@ -316,25 +312,25 @@ MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 - FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR - FETCH $0, 20 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 8 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -353,45 +349,4246 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 12 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE - FETCH $0, 24 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 28 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 - daddiu PREB, PREB, 16 * SIZE + FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) daddiu 
PREA, PREA, 32 * SIZE - + MADPS C34, C34, A7, B8 - bgtz L, .L4880 MADPS C44, C44, A8, B8 - .align 4 -.L484: - andi L, K, 4 - blez L, .L482 - NOP - + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, 
C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS 
C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + 
FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, 
F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + 
FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS 
C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + 
MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, 
A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + 
MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, 
B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, 
F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + bgtz L, .L4810 + MADPS C44, C44, A8, B8 + .align 4 .L482: - andi L, K, 2 - blez L, .L481 + andi L, K, 32 + blez L, .L483 NOP + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + 
FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH 
$0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, 
C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, 
C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS 
C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + 
gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + .align 4 -.L481: +.L483: + andi L, K, 16 + blez L, .L484 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, 
F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + 
+ MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + 
MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L484: + andi L, K, 8 + blez L, .L485 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + 
gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L485: + andi L, K, 4 + blez L, .L486 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + 
MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L486: + andi L, K, 2 + blez L, .L487 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 8 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 16 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS 
C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 8 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + daddiu PREA, PREA, 16 * SIZE + + + .align 4 +.L487: andi L, K, 1 blez L, .L480 - NOP + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 8 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + .align 4 .L480: # Write Back @@ -417,56 +4614,57 @@ CVTU A8, C41 # A8=C41.upper=c28 LD B6, 5 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B7, 7 * SIZE(CO1) - MADD A2, B2, A2, ALPHA # c22 + MADD A2, B2, A2, ALPHA # c22 LD B1, 7 * SIZE(CO2) - MADD A3, B3, A3, ALPHA # c14 + MADD A3, B3, A3, ALPHA # c14 LD B2, 0 * SIZE(CO1) - MADD A4, B4, A4, ALPHA # c24 + MADD A4, B4, A4, ALPHA # c24 LD B3, 0 * SIZE(CO2) - MADD A5, B5, A5, ALPHA # c16 + MADD A5, B5, A5, ALPHA # c16 LD B4, 2 * SIZE(CO1) - MADD A6, B6, A6, ALPHA # c26 + MADD A6, B6, A6, ALPHA # c26 LD B5, 2 * SIZE(CO2) - MADD A7, B7, A7, ALPHA # c18 + MADD A7, B7, A7, ALPHA # c18 LD B6, 4 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 ST A1, 1 * SIZE(CO1) - MADD A8, B1, A8, ALPHA # c28 + MADD C11, B2, C11, ALPHA # c12 LD B7, 4 * SIZE(CO2) + + MADD C13, B3, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) - MADD C11, B2, C11, ALPHA # c12 + MADD C21, B4, C21, ALPHA # c14 LD A1, 6 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 ST A3, 3 * SIZE(CO1) - MADD C13, B3, C13, ALPHA # c22 + MADD C31, B6, C31, ALPHA # c16 LD A2, 6 * SIZE(CO2) + + MADD C33, B7, C33, ALPHA # c26 ST A4, 3 * SIZE(CO2) - MADD C21, B4, C21, ALPHA # c14 ST A5, 5 * SIZE(CO1) - - MADD C23, B5, C23, ALPHA # c24 ST A6, 5 * SIZE(CO2) - - MADD C31, B6, C31, ALPHA # c16 ST A7, 7 * SIZE(CO1) - - MADD C33, B7, C33, ALPHA # c26 ST A8, 7 * SIZE(CO2) - MADD C41, A1, C41, ALPHA # c18 + MADD C41, A1, C41, ALPHA # c18 ST C11, 0 * SIZE(CO1) - MADD C43, A2, C43, ALPHA # c28 + MADD C43, A2, C43, ALPHA # c28 ST C13, 0 * SIZE(CO2) ST C21, 2 * SIZE(CO1) @@ -499,87 +4697,1327 @@ CVTU A8, C42 # A1=C44.upper=c38 LD B7, 7 * SIZE(CO3) - MADD A1, B1, A1, ALPHA # c31 + MADD A1, B1, A1, ALPHA # c31 LD C11, 7 * SIZE(CO4) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD C13, 0 * SIZE(CO3) - MADD A3, B3, A3, ALPHA + MADD A3, B3, A3, ALPHA LD C21, 0 * SIZE(CO4) - MADD A4, B4, A4, ALPHA + MADD A4, B4, A4, ALPHA LD C23, 2 * SIZE(CO3) - MADD A5, B5, A5, ALPHA + MADD A5, B5, A5, ALPHA LD C31, 2 * SIZE(CO4) - MADD A6, B6, A6, ALPHA + MADD A6, B6, A6, ALPHA LD C33, 4 * SIZE(CO3) - MADD A7, B7, A7, ALPHA + MADD A7, B7, A7, ALPHA LD C41, 4 * SIZE(CO4) + + MADD A8, C11, A8, ALPHA ST A1, 1 * SIZE(CO3) - MADD A8, C11, A8, ALPHA + MADD C12, C13, C12, ALPHA LD C43, 6 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA ST A2, 1 * SIZE(CO4) - MADD C12, C13, C12, ALPHA + MADD C22, C23, C22, ALPHA LD B1, 6 * SIZE(CO4) + + MADD C24, C31, C24, ALPHA ST A3, 3 * SIZE(CO3) - MADD C14, C21, C14, ALPHA + MADD C32, C33, C32, ALPHA ST A4, 3 * SIZE(CO4) - MADD C22, C23, 
C22, ALPHA + MADD C34, C41, C34, ALPHA ST A5, 5 * SIZE(CO3) - MADD C24, C31, C24, ALPHA + MADD C42, C43, C42, ALPHA ST A6, 5 * SIZE(CO4) - MADD C32, C33, C32, ALPHA ST A7, 7 * SIZE(CO3) + NOP - MADD C34, C41, C34, ALPHA + MADD C44, B1, C44, ALPHA ST A8, 7 * SIZE(CO4) - MADD C42, C43, C42, ALPHA ST C12, 0 * SIZE(CO3) - - MADD C44, B1, C44, ALPHA ST C14, 0 * SIZE(CO4) - ST C22, 2 * SIZE(CO3) - daddiu CO1, CO1, 8 * SIZE - ST C24, 2 * SIZE(CO4) - daddiu CO2, CO2, 8 * SIZE - ST C32, 4 * SIZE(CO3) ST C34, 4 * SIZE(CO4) ST C42, 6 * SIZE(CO3) ST C44, 6 * SIZE(CO4) + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE daddiu CO3, CO3, 8 * SIZE - bgtz I, .L488 + bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE -.L44: + .align 4 +.L44: + andi I, M, 4 # MR=4 + blez I, .L42 + NOP + + .align 4 +.L441: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L442 + NOP + +.L4410: # + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A5, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C21, C21, A6, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C12, C12, A5, B2 + FETCH $0, 8 * SIZE(PREB) + daddiu BO, BO, 16 * SIZE # 4KR*4NR + + MADPS C22, C22, A6, B2 + FETCH $0, 8 * SIZE(PREA) + daddiu AO, AO, 16 * SIZE # 4KR*4MR + + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A7, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A8, B5 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C12, C12, A7, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C22, C22, A8, B6 + FETCH $0, 12 * SIZE(PREA) + + MADPS C13, C13, A7, B7 + daddiu PREA, PREA, 16 * SIZE + MADPS C23, C23, A8, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L4410 + PLU B4, B2, B2 + + .align 4 +.L442: + andi L, K, 2 + blez L, .L443 + NOP + + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + daddiu BO, BO, 8 * SIZE # 2KR*4NR + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + daddiu AO, AO, 8 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU 
B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F1, F0, 0) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + daddiu PREB, PREB, 8 + MADPS C23, C23, A4, B7 + daddiu PREA, PREA, 8 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L443: + andi L, K, 1 + blez L, .L440 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 1KR*4NR + MADPS C22, C22, A2, B2 + daddiu AO, AO, 4 * SIZE # 1KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + + .align 4 +.L440: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + CVTU A3, C23 # A3=C23.upper=c14 + LD B3, 3 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B4, 3 * SIZE(CO2) + + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B7, 2 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B1, 2 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MADD C21, B7, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MADD C23, B1, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B3, 3 * SIZE(CO3) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B4, 3 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD A3, B3, A3, ALPHA + LD A7, 2 * SIZE(CO3) + + MADD A4, B4, A4, ALPHA + LD A8, 2 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MADD C22, A7, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MADD C24, A8, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + + .align 4 +.L42: + andi I, M, 2 + blez I, .L41 + NOP + + .align 4 +.L421: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L422 + NOP + +.L4210: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + gsLQC1(R12, F3, F2, 1) # B1 B2 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 8 * SIZE # 4KR*2MR + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS 
C11, C11, A3, B1 + gsLQC1(R12, F1, F0, 0) # B3 B4 + + MADPS C12, C12, A3, B2 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + gsLQC1(R13, F9, F8, 0) # B3 B4 + + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L4210 + PLU B4, B2, B2 + + .align 4 +.L422: + andi L, K, 2 + blez L, .L423 + NOP + + daddiu AO, AO, 4 * SIZE # 2KR*2MR + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + daddiu BO, BO, 8 * SIZE # 2KR*2MR + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + gsLQC1(R12, F1, F0, 0) + + PLU B3, B1, B1 + PLU B4, B2, B2 + +.L423: + andi L, K, 1 + blez L, .L420 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 2KR*4NR + daddiu AO, AO, 2 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L420: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + .align 4 +.L41: + andi I, M, 1 + blez I, .L40 + NOP + + .align 4 +.L411: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L412 + MOV C44, C11 + +.L4110: + daddiu L, L, -1 + LD A2, 1 * SIZE(AO) + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A3, 2 * SIZE(AO) + NOP + + MADD C11, C11, A2, B5 + LD B1, 8 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 9 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 10 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 11 * SIZE(BO) + + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B1 + LD B5, 12 * SIZE(BO) + + MADD C12, C12, A3, B2 + LD B6, 13 * SIZE(BO) + + MADD C13, C13, A3, B3 + LD B7, 14 * SIZE(BO) + + MADD C14, C14, A3, B4 + LD B8, 15 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD C11, C11, A4, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A4, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A4, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A4, B8 + bgtz 
L, .L4110 + LD B4, 3 * SIZE(BO) + +.L412: + andi L, K, 2 + blez L, .L413 + NOP + + LD A2, 1 * SIZE(AO) + daddiu AO, AO, 2 * SIZE + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 8 * SIZE + + MADD C11, C11, A2, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 3 * SIZE(BO) + +.L413: + andi L, K, 1 + blez L, .L410 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + MADD C13, C13, A1, B3 + MADD C14, C14, A1, B4 + daddiu BO, BO, 4 * SIZE + + .align 4 +.L410: + LD A5, 0 * SIZE(CO1) + LD A6, 0 * SIZE(CO2) + LD A7, 0 * SIZE(CO3) + LD A8, 0 * SIZE(CO4) + + MADD A5, A5, C11, ALPHA + MADD A6, A6, C12, ALPHA + MADD A7, A7, C13, ALPHA + MADD A8, A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + .align 4 .L40: daddiu J, J, -1 move B, BO - bgtz J, .L48 NOP + + .align 4 .L2: # Nr=2 andi J, N, 2 blez J, .L1 NOP +.L28: + dsra I, M, 3 # MR=8 + + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + blez I, .L24 + daddu C, CO2, LDC + + + .align 4 +.L281: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L282 + MOV C44, C11 + + + .align 4 +.L2810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + LD B3, 2 * SIZE(BO) + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + LD A2, 13 * SIZE(AO) + + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + LD A3, 14 * SIZE(AO) + + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B3 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B3 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B3 + MADD C23, C23, A2, B3 + LD A7, 6 * SIZE(AO) + + MADD C33, C33, A3, B3 + MADD C43, C43, A4, B3 + LD A8, 7 * SIZE(AO) + + MADD C14, C14, A1, B4 + MADD C24, C24, A2, B4 + LD B1, 0 * SIZE(BO) + + MADD C34, C34, A3, B4 + MADD C44, C44, A4, B4 + LD B2, 1 * SIZE(BO) + + MADD C31, C31, B7, B3 + MADD C41, C41, B8, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, B5, B4 + LD A2, 1 * SIZE(AO) + + MADD C22, C22, B6, B4 + LD A3, 2 * SIZE(AO) + + LD A4, 3 * SIZE(AO) + MADD C32, C32, B7, B4 + bgtz L, .L2810 + MADD C42, C42, B8, B4 + + .align 4 +.L282: + andi L, K, 1 + blez L, .L280 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, 
B1 + MADD C43, C43, A8, B1 + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L280: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + LD A5, 4 * SIZE(CO1) + LD A6, 5 * SIZE(CO1) + LD A7, 6 * SIZE(CO1) + LD A8, 7 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD A5, A5, C13, ALPHA + LD B5, 4 * SIZE(CO2) + + MADD A6, A6, C23, ALPHA + LD B6, 5 * SIZE(CO2) + + MADD A7, A7, C33, ALPHA + LD B7, 6 * SIZE(CO2) + + MADD A8, A8, C43, ALPHA + LD C11, 7 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MADD B5, B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MADD B6, B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MADD B7, B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MADD C11, C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE + + + .align 4 +.L24: + andi I, M, 4 # MR=4 + blez I, .L22 + NOP + + .align 4 +.L241: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L242 + MOV C44, C11 + + + .align 4 +.L2410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD B4, 3 * SIZE(BO) + + MADD C41, C41, A4, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 4 * SIZE + + MADD C12, C12, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C22, C22, A2, B2 + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD B2, 1 * SIZE(BO) + + MADD C41, C41, A8, B3 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A5, B4 + LD A3, 2 * SIZE(AO) + + MADD C22, C22, A6, B4 + LD A4, 3 * SIZE(AO) + + MADD C32, C32, A7, B4 + bgtz L, .L2410 + MADD C42, C42, A8, B4 + + .align 4 +.L242: + andi L, K, 1 + blez L, .L240 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L240: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, 
C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + .align 4 +.L22: + andi I, M, 2 + blez I, .L21 + NOP + + .align 4 +.L221: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L222 + MOV C44, C11 + + + .align 4 +.L2210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A3, 2 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C22, C22, A2, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A4, B3 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A3, B4 + LD B2, 1 * SIZE(BO) + + MADD C22, C22, A4, B4 + bgtz L, .L2210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L222: + andi L, K, 1 + blez L, .L220 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L220: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L211: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L212 + MOV C44, C11 + + + .align 4 +.L2110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A1, B2 + LD B3, 2 * SIZE(BO) + + LD B4, 3 * SIZE(BO) + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A2, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, A2, B4 + LD B1, 0 * SIZE(BO) + + bgtz L, .L2110 + LD B2, 1 * SIZE(BO) + + + .align 4 +.L212: + andi L, K, 1 + blez L, .L210 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L210: # Write Back + LD A1, 0 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + + + .align 4 +.L20: + move B, BO + NOP + .align 4 @@ -588,6 +6026,429 @@ blez J, .L999 NOP +.L18: + dsra I, M, 3 # MR=8 + move AO, A # Reset A + blez I, .L14 + NOP + + + .align 4 +.L181: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 
2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L182 + MOV C44, C11 + + + .align 4 +.L1810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C13, C13, A5, B1 + LD B2, 1 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + LD A2, 13 * SIZE(AO) + + MADD C43, C43, A8, B1 + LD A3, 14 * SIZE(AO) + + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B2 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B2 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C23, C23, A2, B2 + LD A8, 7 * SIZE(AO) + + MADD C33, C33, A3, B2 + LD B1, 0 * SIZE(BO) + + MADD C43, C43, A4, B2 + LD A1, 0 * SIZE(AO) + + MADD C31, C31, B7, B2 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, B8, B2 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1810 + LD A4, 3 * SIZE(AO) + + .align 4 +.L182: + andi L, K, 1 + blez L, .L180 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L180: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + LD A5, 4 * SIZE(C) + LD A6, 5 * SIZE(C) + LD A7, 6 * SIZE(C) + LD A8, 7 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + MADD A5, A5, C13, ALPHA + MADD A6, A6, C23, ALPHA + MADD A7, A7, C33, ALPHA + MADD A8, A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 7 * SIZE(C) + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP + + + .align 4 +.L14: + andi I, M, 4 # MR=4 + blez I, .L12 + NOP + + .align 4 +.L141: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L142 + MOV C44, C11 + + + .align 4 +.L1410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 1 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 2 * SIZE + + MADD C41, C41, A4, B1 + LD A7, 6 * SIZE(AO) + + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, A8, B3 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1410 + LD A4, 3 * SIZE(AO) + + .align 4 +.L142: + andi L, K, 1 + blez L, .L140 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu AO, 
AO, 4 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L140: # Write Back + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE + + .align 4 +.L12: + andi I, M, 2 + blez I, .L11 + NOP + + .align 4 +.L121: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + blez L, .L122 + MOV C44, C11 + + + .align 4 +.L1210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B3, 1 * SIZE(BO) + + MADD C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + LD A3, 2 * SIZE(AO) + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD B1, 0 * SIZE(BO) + + MADD C21, C21, A4, B3 + LD A1, 0 * SIZE(AO) + bgtz L, .L1210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L122: + andi L, K, 1 + blez L, .L120 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L120: # Write Back + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE + + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + blez L, .L112 + MOV C32, C11 + + + + .align 4 +.L1110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + + LD A2, 1 * SIZE(AO) + LD B2, 1 * SIZE(BO) + + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + MADD C11, C11, A2, B2 + LD A1, 0 * SIZE(AO) + LD B1, 0 * SIZE(BO) + + bgtz L, .L1110 + NOP + + + .align 4 +.L112: + andi L, K, 1 + blez L, .L110 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L110: # Write Back + LD A1, 0 * SIZE(C) + + MADD A1, A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + + daddiu C, C, 1 * SIZE + + + .align 4 +.L10: + move B, BO + NOP .L999: @@ -627,6 +6488,6 @@ EPILOGUE # .set macro # .set reorder -# .end REALNAME -# .size REALNAME, .-REALNAME -#.ident "GCC: (Debian 4.4.6-6) 4.4.6" +# .end gemm +# .size gemm, .-gemm +# .ident "GCC: (Debian 4.4.6-6) 4.4.6" From 74a3f634890d950ab67c2557232d49440a0d2e1c Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 1 Sep 2011 17:15:28 +0000 Subject: [PATCH 09/30] Tuning mb, kb, nb size to get the best performance. --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 2c3021710..ecdae2e67 100644 --- a/param.h +++ b/param.h @@ -1497,7 +1497,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 -#define SGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 112 #define CGEMM_DEFAULT_Q 100 #define ZGEMM_DEFAULT_Q 80 From a15bc9582485d4f5dab3adf7724488a41352047d Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 09:15:09 +0000 Subject: [PATCH 10/30] Add strmm part. 
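
The TRMM conditionals added in this patch wrap every micro-panel in LEFT/TRANSA blocks that (a) decide where the packed A and B panels start and (b) shorten the K loop to the triangular part, using the KK, OFFSET and TEMP registers. As a reading aid, here is a rough C sketch of the bookkeeping those conditionals appear to implement for the 8x4 panel; the function and variable names (trmm_panel_setup, a_off, b_off, klen) are illustrative only and do not occur in the kernel, and the offsets are in elements where the assembly shifts by BASE_SHIFT to get byte addresses.

#include <stddef.h>

/* Sketch of the per-panel TRMM setup: mr/nr are the micro-tile sizes
 * (8 and 4 for the .L481 block), kk mirrors the KK register, and
 * left/transa mirror the LEFT/TRANSA build flags. */
void trmm_panel_setup(int k, int kk, int mr, int nr,
                      int left, int transa,
                      size_t *a_off, size_t *b_off, int *klen)
{
    if ((left && transa) || (!left && !transa)) {
        /* Panel starts at the beginning of the packed buffers (move BO, B). */
        *a_off = 0;
        *b_off = 0;
    } else {
        /* Skip the first kk steps: AO += kk*mr, BO = B + kk*nr. */
        *a_off = (size_t)kk * mr;
        *b_off = (size_t)kk * nr;
    }

    if ((left && !transa) || (!left && transa))
        *klen = k - kk;       /* dsubu  TEMP, K, KK  */
    else if (left)
        *klen = kk + mr;      /* daddiu TEMP, KK, 8  */
    else
        *klen = kk + nr;      /* daddiu TEMP, KK, 4  */
}

After each panel the write-back code advances AO and BO past whatever the shortened loop did not consume and, when LEFT is defined, bumps KK by the panel height (daddiu KK, KK, 8 for this block), so the next panel starts further along the diagonal.
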
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 1344 ++++++++++++++++++++++++++- 1 file changed, 1327 insertions(+), 17 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 02a0833dd..1b4dae892 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -114,6 +114,12 @@ #define R16 16 #define R17 17 +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + # .text # .align 2 ## .globl gemm @@ -165,6 +171,15 @@ .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE + +#if defined(TRMMKERNEL) + LD OFFSET, 192($fp) +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + blez J, .L2 ST ALPHA, 152($fp) @@ -181,11 +196,81 @@ daddu CO4, CO3, LDC daddu PREA, A, PREA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + blez I, .L44 daddu C, CO4, LDC .align 4 .L481: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 3 + BASE_SHIFT # kk*8mr*datasize + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L # AO point to the data addr + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + FETCH $0, 4 * SIZE(CO3) + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + FETCH $0, 4 * SIZE(CO4) + +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK # TEMP is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 6 + blez L, .L482 + NOP +#else + # GEMM PART move BO, B # Reset B dsra L, K, 6 # UnRoll K=64 @@ -231,6 +316,7 @@ PLU B4, B2, B2 blez L, .L482 FETCH $0, 4 * SIZE(CO4) +#endif .L4810: daddiu L, L, -1 @@ -2413,7 +2499,11 @@ .align 4 .L482: +#ifndef TRMMKERNEL andi L, K, 32 +#else + andi L, TEMP, 32 +#endif blez L, .L483 NOP @@ -3508,7 +3598,11 @@ .align 4 .L483: +#ifndef TRMMKERNEL andi L, K, 16 +#else + andi L, TEMP, 16 +#endif blez L, .L484 NOP @@ -4059,7 +4153,11 @@ .align 4 .L484: +#ifndef TRMMKERNEL andi L, K, 8 +#else + andi L, TEMP, 8 +#endif blez L, .L485 NOP @@ -4338,7 +4436,11 @@ .align 4 .L485: +#ifndef TRMMKERNEL andi L, K, 4 +#else + andi L, TEMP, 4 +#endif blez L, .L486 NOP @@ -4481,7 +4583,11 @@ .align 4 .L486: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L487 NOP @@ -4558,7 +4664,11 @@ .align 4 .L487: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L480 LD ALPHA, 152($fp) @@ -4592,6 +4702,7 @@ .align 4 .L480: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 @@ -4762,7 +4873,141 @@ daddiu CO3, CO3, 8 * SIZE bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE +#else + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + CVTU A5, C33 # A5=C33.upper=c16 + CVTU A6, C31 # A6=C31.upper=c26 + CVTU 
A7, C43 # A7=C43.upper=c18 + CVTU A8, C41 # A8=C41.upper=c28 + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + MUL A5, A5, ALPHA # c16 + MUL A6, A6, ALPHA # c26 + MUL A7, A7, ALPHA # c18 + MUL A8, A8, ALPHA # c28 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + MUL C31, C31, ALPHA # c16 + ST A5, 5 * SIZE(CO1) + + MUL C33, C33, ALPHA # c26 + ST A6, 5 * SIZE(CO2) + + MUL C41, C41, ALPHA # c18 + ST A7, 7 * SIZE(CO1) + + MUL C43, C43, ALPHA # c28 + ST A8, 7 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A2, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A3, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A4, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + CVTU A5, C34 # B5=C32.upper=c46 + ST C31, 4 * SIZE(CO1) + + CVTU A6, C32 # B6=C24.upper=c36 + ST C33, 4 * SIZE(CO2) + + CVTU A7, C44 # B7=C42.upper=c48 + ST C41, 6 * SIZE(CO1) + + CVTU A8, C42 # A1=C44.upper=c38 + ST C43, 6 * SIZE(CO2) + + MUL A1, A1, ALPHA # c31 + MUL A2, A2, ALPHA + MUL A3, A3, ALPHA + MUL A4, A4, ALPHA + MUL A5, A5, ALPHA + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + MUL C32, C32, ALPHA + ST A5, 5 * SIZE(CO3) + + MUL C34, C34, ALPHA + ST A6, 5 * SIZE(CO4) + + MUL C42, C42, ALPHA + ST A7, 7 * SIZE(CO3) + + MUL C44, C44, ALPHA + ST A8, 7 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE + daddiu CO3, CO3, 8 * SIZE + daddiu CO4, CO4, 8 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + bgtz I, .L481 +#endif .align 4 .L44: @@ -4772,6 +5017,65 @@ .align 4 .L441: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 + +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddu TEMP, KK, 4 +#else + daddu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L442 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -4806,13 +5110,12 @@ FETCH $0, 0 * SIZE(CO3) MOV 
C44, C11 - PLU B3, B1, B1 - FETCH $0, 0 * SIZE(CO4) - PLU B4, B2, B2 + FETCH $0, 0 * SIZE(CO4) blez L, .L442 - NOP + PLU B4, B2, B2 +#endif .L4410: # daddiu L, L, -1 @@ -4907,7 +5210,11 @@ .align 4 .L442: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L443 NOP @@ -4960,7 +5267,11 @@ .align 4 .L443: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L440 LD ALPHA, 152($fp) @@ -4981,6 +5292,7 @@ .align 4 .L440: +#ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) @@ -5069,6 +5381,86 @@ daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + CVTU A5, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A6, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A7, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A8, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + MUL A5, A5, ALPHA # c31 + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A5, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A6, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A7, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A8, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif .align 4 .L42: @@ -5078,6 +5470,62 @@ .align 4 .L421: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L422 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -5110,13 +5558,12 @@ FETCH $0, 0 * SIZE(CO3) MOV C44, C11 - PLU B3, B1, B1 - FETCH $0, 0 * SIZE(CO4) - PLU B4, B2, B2 + FETCH $0, 0 * SIZE(CO4) blez L, .L422 - NOP + PLU B4, B2, B2 +#endif .L4210: daddiu L, L, -1 @@ -5168,7 +5615,11 @@ .align 4 .L422: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L423 NOP @@ -5196,7 
+5647,11 @@ PLU B4, B2, B2 .L423: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L420 LD ALPHA, 152($fp) @@ -5210,6 +5665,7 @@ .align 4 .L420: +#ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) @@ -5256,6 +5712,60 @@ daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + + MUL C11, C11, ALPHA # c12 + MUL C13, C13, ALPHA # c22 + + CVTU A3, C14 # B1=C12.upper=c42 + CVTU A4, C12 # B2=C14.upper=c32 + + MUL A3, A3, ALPHA # c31 + ST A1, 1 * SIZE(CO1) + + MUL A4, A4, ALPHA + ST A2, 1 * SIZE(CO2) + + MUL C12, C12, ALPHA + ST C11, 0 * SIZE(CO1) + + MUL C14, C14, ALPHA + ST C13, 0 * SIZE(CO2) + + ST A3, 1 * SIZE(CO3) + ST A4, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif .align 4 @@ -5266,6 +5776,56 @@ .align 4 .L411: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L412 + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -5298,6 +5858,7 @@ MOV C43, C11 blez L, .L412 MOV C44, C11 +#endif .L4110: daddiu L, L, -1 @@ -5362,7 +5923,11 @@ LD B4, 3 * SIZE(BO) .L412: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L413 NOP @@ -5397,7 +5962,11 @@ LD B4, 3 * SIZE(BO) .L413: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L410 LD ALPHA, 152($fp) @@ -5410,6 +5979,7 @@ .align 4 .L410: +#ifndef TRMMKERNEL LD A5, 0 * SIZE(CO1) LD A6, 0 * SIZE(CO2) LD A7, 0 * SIZE(CO3) @@ -5429,9 +5999,47 @@ daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE +#else + MUL A5, C11, ALPHA + MUL A6, C12, ALPHA + MUL A7, C13, ALPHA + MUL A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 4 .L40: +#if 
defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif daddiu J, J, -1 move B, BO bgtz J, .L48 @@ -5451,13 +6059,75 @@ move AO, A # Reset A move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif daddu CO2, C, LDC blez I, .L24 daddu C, CO2, LDC - .align 4 .L281: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 3 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L282 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5500,7 +6170,7 @@ MOV C43, C11 blez L, .L282 MOV C44, C11 - +#endif .align 4 .L2810: @@ -5582,7 +6252,11 @@ .align 4 .L282: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L280 LD ALPHA, 152($fp) @@ -5609,6 +6283,7 @@ .align 4 .L280: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(CO1) @@ -5680,6 +6355,72 @@ daddiu CO1, CO1, 8 * SIZE bgtz I, .L281 daddiu CO2, CO2, 8 * SIZE +#else + daddiu I, I, -1 + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + MUL B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MUL B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MUL B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MUL B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MUL B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MUL B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MUL B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MUL C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE +#endif .align 4 @@ -5690,6 +6431,58 @@ .align 4 .L241: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV 
C34, C11 + + MOV C43, C11 + blez L, .L242 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L242 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5723,7 +6516,7 @@ MOV C43, C11 blez L, .L242 MOV C44, C11 - +#endif .align 4 .L2410: @@ -5775,7 +6568,11 @@ .align 4 .L242: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L240 LD ALPHA, 152($fp) @@ -5793,6 +6590,7 @@ .align 4 .L240: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) LD A2, 1 * SIZE(CO1) LD A3, 2 * SIZE(CO1) @@ -5829,6 +6627,50 @@ daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE +#else + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + + MUL B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MUL B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MUL B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MUL B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif .align 4 .L22: @@ -5838,6 +6680,46 @@ .align 4 .L221: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L222 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5860,6 +6742,7 @@ MOV C43, C11 blez L, .L222 MOV C44, C11 +#endif .align 4 @@ -5895,7 +6778,11 @@ .align 4 .L222: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L220 LD ALPHA, 152($fp) @@ -5909,6 +6796,7 @@ .align 4 .L220: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) LD A2, 1 * SIZE(CO1) @@ -5929,7 +6817,39 @@ daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE +#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL B1, C12, ALPHA + MUL B2, C22, ALPHA + + ST A1, 0 * SIZE(CO1) + ST A2, 1 * SIZE(CO1) + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddu KK, KK, 2 +#endif +#endif .align 4 .L21: @@ -5939,6 +6859,46 @@ .align 4 .L211: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + 
dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L212 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5960,7 +6920,7 @@ MOV C43, C11 blez L, .L212 MOV C44, C11 - +#endif .align 4 .L2110: @@ -5987,7 +6947,11 @@ .align 4 .L212: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L210 LD ALPHA, 152($fp) @@ -5999,6 +6963,7 @@ .align 4 .L210: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) MADD A1, A1, C11, ALPHA @@ -6011,12 +6976,42 @@ daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE +#else + + MUL A1, C11, ALPHA + MUL B1, C12, ALPHA + + ST A1, 0 * SIZE(CO1) + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, 1 +#else + daddiu TEMP, TEMP, 2 +#endif + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 4 .L20: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif move B, BO - NOP @@ -6029,12 +7024,76 @@ .L18: dsra I, M, 3 # MR=8 move AO, A # Reset A + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif blez I, .L14 NOP .align 4 .L181: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 3 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L182 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6076,6 +7135,7 @@ MOV C43, C11 blez L, .L182 MOV C44, C11 +#endif .align 4 @@ -6138,7 +7198,11 @@ .align 4 .L182: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L180 LD ALPHA, 152($fp) @@ -6157,6 +7221,7 @@ .align 4 .L180: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(C) @@ -6189,7 +7254,51 @@ daddiu C, C, 8 * SIZE bgtz I, .L181 NOP +#else + daddiu I, I, -1 + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 
7 * SIZE(C) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK + +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP +#endif .align 4 .L14: @@ -6199,6 +7308,56 @@ .align 4 .L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L142 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6231,7 +7390,7 @@ MOV C43, C11 blez L, .L142 MOV C44, C11 - +#endif .align 4 .L1410: @@ -6270,7 +7429,11 @@ .align 4 .L142: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L140 LD ALPHA, 152($fp) @@ -6284,6 +7447,7 @@ .align 4 .L140: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) LD A3, 2 * SIZE(C) @@ -6299,6 +7463,36 @@ ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) daddiu C, C, 4 * SIZE +#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif .align 4 .L12: @@ -6308,6 +7502,48 @@ .align 4 .L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L122 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6329,7 +7565,7 @@ MOV C43, C11 blez L, .L122 MOV C44, C11 - +#endif .align 4 .L1210: @@ -6355,7 +7591,11 @@ .align 4 .L122: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L120 LD ALPHA, 152($fp) @@ -6367,6 +7607,7 @@ .align 4 .L120: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) @@ -6377,7 +7618,33 @@ ST A2, 1 * SIZE(C) daddiu C, C, 2 * SIZE 
+#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif .align 4 .L11: @@ -6387,6 +7654,38 @@ .align 4 .L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + daddu AO, AO, L + daddu BO, B, L +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + MOV C32, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L112 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6401,7 +7700,7 @@ MOV C31, C11 blez L, .L112 MOV C32, C11 - +#endif .align 4 @@ -6425,7 +7724,11 @@ .align 4 .L112: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L110 LD ALPHA, 152($fp) @@ -6436,6 +7739,7 @@ .align 4 .L110: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) MADD A1, A1, C11, ALPHA @@ -6443,14 +7747,20 @@ ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE +#else + MUL A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + daddiu C, C, 1 * SIZE + +#endif .align 4 .L10: move B, BO NOP - .L999: ld $16, 0($fp) ld $17, 8($fp) From 23e182ca7c7cbf3dae151d3d084c074078b075fa Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 15:28:01 +0000 Subject: [PATCH 11/30] Fix stack-pointer bug for strmm. 
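For the TRMM build this kernel has to reach the stack twice: once for the incoming OFFSET argument, which the caller passes above the frame, and once for the ALPHA value the prologue spills at 152. The old prologue saved $fp on a 192-byte frame and addressed the saved registers, the ALPHA spill and the OFFSET fetch through $fp; this patch drops the frame pointer, returns STACKSIZE to 160, loads OFFSET with LDARG (the integer load) right after the frame is allocated instead of with the floating-point LD macro later on, and makes every save, spill and reload $sp-relative, so there is a single base register and a single frame size to keep consistent. A minimal sketch of the corrected prologue pattern, with the slot numbers this file actually uses (LDARG, ST, LD, OFFSET and ALPHA are the macros defined in common.h and at the top of the kernel):

    daddiu  $sp, $sp, -STACKSIZE     # allocate the frame first
    sd      $16, 0($sp)              # callee-saved GPRs, $sp-relative
    ST      $f24, 56($sp)            # callee-saved FPRs, same base register
#if defined(TRMMKERNEL)
    LDARG   OFFSET, 160($sp)         # stack argument: exactly STACKSIZE above the new $sp
#endif
    ST      ALPHA, 152($sp)          # spill alpha inside the frame ...
    ...
    LD      ALPHA, 152($sp)          # ... and reload it from the same base later on

With STACKSIZE = 160 the 160($sp) load lands on the caller's 0($sp), which is where the ninth argument sits on entry in the 64-bit ABI.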
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 116 +++++++++++++--------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 1b4dae892..93002547b 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -3,7 +3,7 @@ #include "common.h" #define FETCH ld -#define STACKSIZE 192 +#define STACKSIZE 160 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) @@ -127,7 +127,7 @@ # .ent gemm # .type gemm, @function #gemm: -# .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 +# .frame $sp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 # .set noreorder @@ -137,34 +137,34 @@ PROLOGUE daddiu $sp,$sp,-STACKSIZE - sd $fp,184($sp) - move $fp,$sp - sd $16, 0($fp) - sd $17, 8($fp) - sd $18, 16($fp) - sd $19, 24($fp) - sd $20, 32($fp) - sd $21, 40($fp) - sd $22, 48($fp) + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) - ST $f24, 56($fp) - ST $f25, 64($fp) - ST $f26, 72($fp) - ST $f27, 80($fp) - ST $f28, 88($fp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) #if defined(TRMMKERNEL) - sd $23, 96($fp) - sd $24, 104($fp) - sd $25, 112($fp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + + LDARG OFFSET, 160($sp) #endif #ifndef __64BIT__ - ST $f20,120($fp) - ST $f21,128($fp) - ST $f22,136($fp) - ST $f23,144($fp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) #endif .align 4 @@ -172,16 +172,12 @@ dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE -#if defined(TRMMKERNEL) - LD OFFSET, 192($fp) -#endif - #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif blez J, .L2 - ST ALPHA, 152($fp) + ST ALPHA, 152($sp) .L48: dsra I, M, 3 # MR=8 @@ -4670,7 +4666,7 @@ andi L, TEMP, 1 #endif blez L, .L480 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -5273,7 +5269,7 @@ andi L, TEMP, 1 #endif blez L, .L440 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -5653,7 +5649,7 @@ andi L, TEMP, 1 #endif blez L, .L420 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 @@ -5968,7 +5964,7 @@ andi L, TEMP, 1 #endif blez L, .L410 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 @@ -6258,7 +6254,7 @@ andi L, TEMP, 1 #endif blez L, .L280 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 @@ -6574,7 +6570,7 @@ andi L, TEMP, 1 #endif blez L, .L240 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -6784,7 +6780,7 @@ andi L, TEMP, 1 #endif blez L, .L220 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -6953,7 +6949,7 @@ andi L, TEMP, 1 #endif blez L, .L210 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 @@ -7204,7 +7200,7 @@ andi L, TEMP, 1 #endif blez L, .L180 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 @@ -7435,7 +7431,7 @@ andi L, TEMP, 1 #endif blez L, .L140 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -7597,7 +7593,7 @@ andi L, TEMP, 1 #endif blez L, .L120 - 
LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -7730,7 +7726,7 @@ andi L, TEMP, 1 #endif blez L, .L110 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 daddiu AO, AO, 1 * SIZE @@ -7762,35 +7758,33 @@ NOP .L999: - ld $16, 0($fp) - ld $17, 8($fp) - ld $18, 16($fp) - ld $19, 24($fp) - ld $20, 32($fp) - ld $21, 40($fp) - ld $22, 48($fp) + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) - LD $f24, 56($fp) - LD $f25, 64($fp) - LD $f26, 72($fp) - LD $f27, 80($fp) - LD $f28, 88($fp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) #if defined(TRMMKERNEL) - ld $23, 96($fp) - ld $24, 104($fp) - ld $25, 112($fp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) #endif #ifndef __64BIT__ - LD $f20,120($fp) - LD $f21,128($fp) - LD $f22,136($fp) - LD $f23,144($fp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) #endif - move $sp,$fp - ld $fp,184($sp) daddiu $sp,$sp,STACKSIZE j $31 nop From a059c553a11ae1ae944161975075326661593a86 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:00:04 +0000 Subject: [PATCH 12/30] Fix a compute error for strmm. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 93002547b..2da94e5aa 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6985,9 +6985,9 @@ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT - daddiu TEMP, TEMP, 1 + daddiu TEMP, TEMP, -1 #else - daddiu TEMP, TEMP, 2 + daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT From 3274ff47b854bff0b0c5e66b24e50cddbafc7dca Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:50:50 +0000 Subject: [PATCH 13/30] Fix an error for strmm_LLTN. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 2da94e5aa..6191196f7 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6465,7 +6465,6 @@ MOV C34, C11 MOV C43, C11 - blez L, .L242 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK From 790614683656cddf8cf4fcbe2933274012dd3314 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:57:33 +0000 Subject: [PATCH 14/30] Fix an error for strmm_LLTN. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 6191196f7..efe62384c 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6069,7 +6069,7 @@ move BO, B #else dsll L, KK, 3 + BASE_SHIFT - dsll TEMP, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP From 74d4cdb81a59393feee3affeb777d9724a5b6ff0 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 19:41:06 +0000 Subject: [PATCH 15/30] Fix an illegal instruction for strmm_RTLU. 
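The entire change is the one NOP. On MIPS the instruction immediately after a branch (the delay slot) is executed whether or not the branch is taken, and these kernels fill their delay slots by hand, so every bgtz/blez is meant to be followed by something that is safe to run unconditionally. The `bgtz I, .L481` at the end of the TRMM write-back was immediately followed by an `#endif`, leaving the delay slot to whatever the assembler emitted next; the explicit NOP makes the slot well defined. Reading the strmm_RTLU illegal instruction as a stray word landing in that slot is an inference from the fix, not something the patch states, but the repaired pattern is the standard one:

    bgtz    I, .L481        # loop back for the next MR=8 panel
    NOP                     # branch delay slot: executed on every iteration,
                            # so it must hold a harmless instruction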
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index efe62384c..bc81d0eb5 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -5003,6 +5003,7 @@ #endif bgtz I, .L481 + NOP #endif .align 4 From 4727fe8abfd6fa93bb78347f535bfa86d75263d5 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 5 Sep 2011 15:13:05 +0000 Subject: [PATCH 16/30] Refs #47. On Loongson 3A, set DGEMM_R parameter depending on different number of threads. It would improve double precision BLAS3 on multi-threads. --- common_macro.h | 4 +++- driver/others/blas_server.c | 5 +++++ driver/others/blas_server_omp.c | 5 +++++ driver/others/memory.c | 2 +- driver/others/parameter.c | 28 ++++++++++++++++++++++++++++ param.h | 3 ++- 6 files changed, 44 insertions(+), 3 deletions(-) diff --git a/common_macro.h b/common_macro.h index bcaa9f38b..0c34ecb01 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2127,7 +2127,9 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) +extern BLASLONG gemm_offset_a; +extern BLASLONG gemm_offset_b; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c0f77c4c9..a026ccb26 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -797,6 +797,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. + blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4fd4cd440..c45856fd9 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); +#if defined(ARCH_MIPS64) + //set parameters for different number of threads. 
+ blas_set_parameter(); +#endif + } void openblas_set_num_threads(int num_threads) { diff --git a/driver/others/memory.c b/driver/others/memory.c index dd8334477..ac9c87850 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){ if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 9e72fd24f..80f708452 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -45,8 +45,22 @@ int get_L2_size(void); #define DEFAULT_GEMM_P 128 #define DEFAULT_GEMM_Q 128 #define DEFAULT_GEMM_R 128 +#define DEFAULT_GEMM_OFFSET_A 0 +#define DEFAULT_GEMM_OFFSET_B 0 /* Global Parameter */ +#if GEMM_OFFSET_A == gemm_offset_a +BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; +#else +BLASLONG gemm_offset_a = GEMM_OFFSET_A; +#endif + +#if GEMM_OFFSET_B == gemm_offset_b +BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; +#else +BLASLONG gemm_offset_b = GEMM_OFFSET_B; +#endif + #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -666,3 +680,17 @@ void blas_set_parameter(void){ #endif #endif + +#if defined(ARCH_MIPS64) +void blas_set_parameter(void){ +#if defined(LOONGSON3A) + if(blas_num_threads == 1){ + //single thread + dgemm_r = 1000; + }else{ + //multi thread + dgemm_r = 300; + } +#endif +} +#endif diff --git a/param.h b/param.h index ecdae2e67..52a132049 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#define DGEMM_DEFAULT_R 200 //#define DGEMM_DEFAULT_R 400 //#define DGEMM_DEFAULT_R 192 -#define DGEMM_DEFAULT_R 1000 +#define DGEMM_DEFAULT_R dgemm_r +//1000 //#define DGEMM_DEFAULT_R 160 //#define DGEMM_DEFAULT_R 270 #define CGEMM_DEFAULT_R 1000 From 64fa709d1f2b758a5bcea3f32f2bb50ddae97e30 Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 5 Sep 2011 16:30:55 +0000 Subject: [PATCH 17/30] Fixed #46. Initialize variables in cblat3.f and zblat3.f. --- test/cblat3.f | 2 ++ test/zblat3.f | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/cblat3.f b/test/cblat3.f index b26be91e6..5df1ddd64 100644 --- a/test/cblat3.f +++ b/test/cblat3.f @@ -1301,6 +1301,8 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO + RALS = RONE + RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) diff --git a/test/zblat3.f b/test/zblat3.f index d6a522f2a..f03b1a617 100644 --- a/test/zblat3.f +++ b/test/zblat3.f @@ -1303,6 +1303,8 @@ NC = 0 RESET = .TRUE. ERRMAX = RZERO + RALS = RONE + RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) From 3c856c0c1a7f8484e87dd564af8b84427baea27b Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 6 Sep 2011 18:27:33 +0000 Subject: [PATCH 18/30] Check the return value of pthread_create. Update the docs with known issue on Loongson 3A. --- README | 1 + driver/others/blas_server.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/README b/README index 248741544..b67db1169 100644 --- a/README +++ b/README @@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve 9.Known Issues * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. +* On Loongson 3A. make test would be failed because of pthread_create error. 
The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. 10. Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index a026ccb26..66067a05c 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -500,6 +500,7 @@ static int blas_monitor(void *arg){ /* Initializing routine */ int blas_thread_init(void){ BLASLONG i; + int ret; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif @@ -545,12 +546,16 @@ int blas_thread_init(void){ pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR - pthread_create(&blas_threads[i], &attr, + ret=pthread_create(&blas_threads[i], &attr, (void *)&blas_thread_server, (void *)i); #else - pthread_create(&blas_threads[i], NULL, + ret=pthread_create(&blas_threads[i], NULL, (void *)&blas_thread_server, (void *)i); #endif + if(ret!=0){ + fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret); + exit(1); + } } #ifdef MONITOR From 16fc083322eefd9e309b412e26db6fca62496afc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 8 Sep 2011 16:39:34 +0000 Subject: [PATCH 19/30] Refs #47. Fixed the seting parameter bug on Loongson 3A single thread version. --- driver/others/parameter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 80f708452..4a8542a93 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -684,13 +684,17 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ #if defined(LOONGSON3A) +#ifdef SMP if(blas_num_threads == 1){ +#endif //single thread dgemm_r = 1000; +#ifdef SMP }else{ //multi thread dgemm_r = 300; } #endif +#endif } #endif From d238a768abac572235cbe19db179587ebfc54545 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 14 Sep 2011 15:32:25 +0000 Subject: [PATCH 20/30] Use ps instructions in cgemm. 
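A single-precision complex number fits one 64-bit FP register as a paired single, so madd.ps (the MADPS macro) performs two multiply-adds per instruction. The new 4x2 kernel keeps one complex element of A or B per register, builds a lane-swapped copy of each B value with PLU, and runs two sets of accumulators: one fed by the original B lanes and one by the swapped lanes, which between them collect the ac/bd and ad/bc products; the write-back then pulls a lane out with CVTU (cvt.s.pu) and recombines everything with the conjugation-dependent signs. The inner-loop pattern, lifted from the new kernel (the comments are mine; gsLQC1 is the 128-bit load macro encoded with .word at the top of the file):

    gsLQC1(R12, F1, F0, 0)          # A1, A2: two packed complex elements of A
    gsLQC1(R13, F9, F8, 0)          # B1, B2: two packed complex elements of B
    PLU     B3, B1, B1              # lane-swapped copy of B1 for the cross terms
    MADPS   C11, C11, A1, B1        # accumulates {a*c, b*d}, two madds at once
    MADPS   C13, C13, A1, B3        # accumulates the cross terms {a*d, b*c}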
--- kernel/mips64/KERNEL.LOONGSON3A | 6 +- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 921 ++++++++++++++++++ param.h | 6 +- 3 files changed, 929 insertions(+), 4 deletions(-) create mode 100644 kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 4a195f265..91f2e7dd1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -17,9 +17,13 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_loongson3a_2x2.S +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S new file mode 100644 index 000000000..67d2333cb --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -0,0 +1,921 @@ +##define REALNAME gemm +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define STACKSIZE 192 +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +##### Parameter registers #### +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 + +#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 $f23 +#define C13 $f24 +#define C14 $f25 +#define C23 $f26 +#define C24 $f27 +#define C33 $f28 +#define C34 $f29 +#define C43 $f30 +#define C44 $f31 + +#define I $2 +#define J $3 +#define L $7 + +#### Alpha register #### +#define ALPHA $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + +#define R12 12 +#define R13 13 + +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + + + PROLOGUE + + daddiu $sp,$sp,-STACKSIZE + + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + +#if defined(TRMMKERNEL) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + + LDARG OFFSET, 160($sp) +#endif + +#ifndef __64BIT__ + ST $f20,120($sp) + ST 
$f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) +#endif + + .align 4 +.L2: + dsra J, N, 1 # NR=2 + ST $f15, 152($sp) + + dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE + blez J, .L1 + ST $f16, 160($sp) + +.L24: + dsra I, M, 2 # MR=8 + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + blez I, .L22 + daddu C, CO2, LDC + + .align 4 +.L241: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + MOV C43, C11 + PLU B3, B1, B1 + + MOV C44, C11 + blez L, .L242 + PLU B4, B2, B2 + +.L2410: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 6) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B1 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + bgtz L, .L2410 + MADPS C44, C44, A8, B8 + + + .align 4 +.L242: + andi L, K, 2 + blez L, .L247 + NOP + + .align 4 +.L247: + andi L, K, 1 + blez L, .L240 + NOP + + + .align 4 +.L240: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 
+ + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + SUB C41, C41, A4 + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + + ADD C31, A3, C31 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + ADD C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + SUB C13, A5, C13 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + SUB C23, A6, C23 + LD B5, 4 * SIZE(CO1) + + SUB C33, A7, C33 + LD B7, 6 * SIZE(CO1) + + SUB C43, A8, C43 + LD B2, 1 * SIZE(CO1) + + ADD C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + ADD C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + ADD C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + ADD C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + SUB C14, B5, C14 + MADD B3, B3, C21, A1 + + SUB C24, B6, C24 + MADD B5, B5, C31, A1 + + SUB C34, B7, C34 + MADD B7, B7, C41, A1 + + SUB C44, B8, C44 + MADD B2, B2, C13, A1 + + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 
6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + + ADD C31, A3, C31 +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + ADD C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + SUB C13, C13, A5 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + SUB C23, C23, A6 + LD B5, 4 * SIZE(CO1) + + SUB C33, C33, A7 + LD B7, 6 * SIZE(CO1) + + SUB C43, C43, A8 + LD B2, 1 * SIZE(CO1) + + ADD C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + ADD C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + ADD C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + ADD C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + SUB C14, C14, B5 + MADD B3, B3, C21, A1 + + SUB C24, C24, B6 + MADD B5, B5, C31, A1 + + SUB C34, C34, B7 + MADD B7, B7, C41, A1 + + SUB C44, C44, B8 + MADD B2, B2, C13, A1 + + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, A1, C11 # ac'+'bd + SUB C21, A2, C21 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + + SUB C31, A3, C31 +# LD A2, 0 * SIZE(A) # load alpha_i + LD A2, 160($sp) + + SUB C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + ADD C13, A5, C13 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + ADD C23, A6, C23 + LD B5, 4 * SIZE(CO1) + + 
ADD C33, A7, C33 + LD B7, 6 * SIZE(CO1) + + ADD C43, A8, C43 + LD B2, 1 * SIZE(CO1) + + SUB C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + SUB C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + SUB C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + SUB C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + ADD C14, B5, C14 + MADD B3, B3, C21, A1 + + ADD C24, B6, C24 + MADD B5, B5, C31, A1 + + ADD C34, B7, C34 + MADD B7, B7, C41, A1 + + ADD C44, B8, C44 + NMSUB B2, B2, C13, A1 + + NMSUB B4, B4, C23, A1 + NMSUB B6, B6, C33, A1 + + NMSUB B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + NMSUB C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + NMSUB C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + NMSUB C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + NMSUB C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L241 + daddiu CO2, CO2, 8 * SIZE + + .align 4 +.L22: + andi I, M, 2 # MR=4 + blez I, .L21 + NOP + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L20: + daddiu J, J, -1 + move B, BO + bgtz J, .L24 + NOP + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + + .align 4 +.L10: + move B, BO + +.L999: + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + +#if defined(TRMMKERNEL) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) +#endif + +#ifndef __64BIT__ + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) +#endif + + daddiu $sp,$sp,STACKSIZE + j $31 + nop + + EPILOGUE diff --git a/param.h b/param.h index 52a132049..1c729e8b9 100644 --- a/param.h +++ b/param.h @@ -1486,7 +1486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -1499,7 +1499,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 112 -#define CGEMM_DEFAULT_Q 100 +#define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1024 @@ -1511,7 +1511,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//1000 //#define DGEMM_DEFAULT_R 160 //#define DGEMM_DEFAULT_R 270 -#define CGEMM_DEFAULT_R 1000 +#define CGEMM_DEFAULT_R 1024 //#define ZGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 From 9679dd077e59407860dfa82e11d4f7ba07468496 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 14 Sep 2011 20:00:35 +0000 Subject: [PATCH 21/30] Fix some compute error. --- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 219 +++++++----------- 1 file changed, 82 insertions(+), 137 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 67d2333cb..7371ba280 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -12,10 +12,10 @@ #define M $4 #define N $5 #define K $6 -#define A $8 -#define B $9 -#define C $10 -#define LDC $11 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 #### Pointer A, B, C #### #define AO $12 @@ -120,6 +120,7 @@ PROLOGUE + LDARG LDC, 0($sp) daddiu $sp,$sp,-STACKSIZE sd $16, 0($sp) @@ -141,7 +142,7 @@ sd $24, 104($sp) sd $25, 112($sp) - LDARG OFFSET, 160($sp) + LDARG OFFSET, STACKSIZE($sp) #endif #ifndef __64BIT__ @@ -379,13 +380,12 @@ /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 - LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i - - SUB C41, C41, A4 ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 @@ -488,78 +488,60 @@ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r - LD A1, 152($sp) # load alpha_r - ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r - - ADD C41, A4, C41 - LD B1, 0 * SIZE(CO1) - SUB C13, A5, C13 # ad'+'cb - LD B3, 2 * SIZE(CO1) - SUB C23, A6, C23 - LD B5, 4 * SIZE(CO1) - SUB C33, A7, C33 - LD B7, 6 * SIZE(CO1) - SUB C43, A8, C43 - LD B2, 1 * SIZE(CO1) - ADD C12, B1, C12 - LD B4, 3 * SIZE(CO1) - ADD C22, B2, C22 - LD B6, 5 * SIZE(CO1) - ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - ADD C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - SUB C14, B5, C14 MADD B3, B3, C21, A1 - - SUB C24, B6, C24 MADD B5, B5, C31, A1 - - SUB C34, B7, C34 MADD B7, B7, C41, A1 - - SUB C44, B8, C44 MADD B2, B2, C13, A1 - MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 - MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 @@ -611,78 +593,60 @@ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r - LD A1, 152($sp) # load alpha_r - ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r - LD A2, 160($sp) # load alpha_i - ADD 
C41, A4, C41 - LD B1, 0 * SIZE(CO1) - + LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb - LD B3, 2 * SIZE(CO1) - SUB C23, C23, A6 - LD B5, 4 * SIZE(CO1) - SUB C33, C33, A7 - LD B7, 6 * SIZE(CO1) - SUB C43, C43, A8 - LD B2, 1 * SIZE(CO1) - ADD C12, B1, C12 - LD B4, 3 * SIZE(CO1) - ADD C22, B2, C22 - LD B6, 5 * SIZE(CO1) - ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + SUB C34, C34, B7 + SUB C44, C44, B8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - ADD C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - SUB C14, C14, B5 MADD B3, B3, C21, A1 - - SUB C24, C24, B6 MADD B5, B5, C31, A1 - - SUB C34, C34, B7 MADD B7, B7, C41, A1 - - SUB C44, C44, B8 MADD B2, B2, C13, A1 - MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 - MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 @@ -731,113 +695,94 @@ #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ - SUB C11, A1, C11 # ac'+'bd - SUB C21, A2, C21 + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r - - SUB C31, A3, C31 -# LD A2, 0 * SIZE(A) # load alpha_i + SUB C41, C41, A4 LD A2, 160($sp) - - SUB C41, A4, C41 - LD B1, 0 * SIZE(CO1) +# LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb - LD B3, 2 * SIZE(CO1) - ADD C23, A6, C23 - LD B5, 4 * SIZE(CO1) - ADD C33, A7, C33 - LD B7, 6 * SIZE(CO1) - ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) - - SUB C12, B1, C12 LD B4, 3 * SIZE(CO1) - - SUB C22, B2, C22 LD B6, 5 * SIZE(CO1) - - SUB C32, B3, C32 LD B8, 7 * SIZE(CO1) - SUB C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - ADD C14, B5, C14 MADD B3, B3, C21, A1 - - ADD C24, B6, C24 MADD B5, B5, C31, A1 - - ADD C34, B7, C34 MADD B7, B7, C41, A1 - - ADD C44, B8, C44 NMSUB B2, B2, C13, A1 - NMSUB B4, B4, C23, A1 NMSUB B6, B6, C33, A1 - NMSUB B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) - MADD C23, C23, C22, A1 - MADD C33, C33, C32, A1 + MADD C13, C13, C12, A1 ST B1, 0 * SIZE(CO1) - MADD C43, C43, C42, A1 + MADD C23, C23, C22, A1 ST B3, 2 * SIZE(CO1) - NMSUB C11, C11, C14, A1 + MADD C33, C33, C32, A1 ST B5, 4 * SIZE(CO1) - NMSUB C21, 
C21, C24, A1 + MADD C43, C43, C42, A1 ST B7, 6 * SIZE(CO1) - NMSUB C31, C31, C34, A1 + NMSUB C11, C11, C14, A1 ST B2, 1 * SIZE(CO1) - NMSUB C41, C41, C44, A1 + NMSUB C21, C21, C24, A1 ST B4, 3 * SIZE(CO1) - NMSUB C13, C13, C14, A2 + NMSUB C31, C31, C34, A1 ST B6, 5 * SIZE(CO1) - NMSUB C23, C23, C24, A2 + NMSUB C41, C41, C44, A1 ST B8, 7 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 From 7fa3d23dd91ba6aaae6f77f210f338ba55422e49 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 15 Sep 2011 16:08:23 +0000 Subject: [PATCH 22/30] Complete cgemm function, but no optimization. --- common_mips64.h | 2 + .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 1689 ++++++++++++++++- 2 files changed, 1652 insertions(+), 39 deletions(-) diff --git a/common_mips64.h b/common_mips64.h index 2aa325bfa..35d8265bc 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.d #define CMPLE c.le.d #define CMPLT c.lt.d +#define NEG neg.d #else #define LD lwc1 #define ST swc1 @@ -177,6 +178,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define MADPS madd.ps #define CVTU cvt.s.pu #define CVTL cvt.s.pl +#define NEG neg.s #endif #if defined(__64BIT__) && defined(USE64BITINT) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 7371ba280..b57213a24 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -185,9 +185,9 @@ MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 + gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 - gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 @@ -195,20 +195,21 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - FETCH $0, 4 * SIZE(CO1) MOV C33, C11 FETCH $0, 0 * SIZE(CO2) - MOV C34, C11 - FETCH $0, 4 * SIZE(CO2) - - MOV C43, C11 - PLU B3, B1, B1 - MOV C44, C11 - blez L, .L242 + FETCH $0, 8 * SIZE(CO2) + MOV C34, C11 + MOV C43, C11 + + PLU B3, B1, B1 PLU B4, B2, B2 + blez L, .L242 + MOV C44, C11 .L2410: daddiu L, L, -1 @@ -234,9 +235,9 @@ MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 - PLU B7, B5, B5 - MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 @@ -264,9 +265,9 @@ MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 - PLU B3, B1, B1 - MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 @@ -282,12 +283,12 @@ gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B1 - daddiu BO, BO, 16 * SIZE # 4KR*4NR + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR MADPS C41, C41, A4, B1 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 - daddiu AO, AO, 32 * SIZE # 4KR*8MR + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 @@ -296,9 +297,9 @@ MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 - PLU B7, B5, B5 - MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 @@ -326,9 +327,9 @@ MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 - PLU B3, B1, B1 - MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 @@ -342,12 +343,100 @@ blez L, .L247 NOP + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS 
C41, C41, A4, B1 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 4 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + .align 4 .L247: andi L, K, 1 blez L, .L240 NOP + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu BO, BO, 1 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + .align 4 .L240: # Write Back @@ -417,13 +506,10 @@ MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 @@ -528,9 +614,9 @@ NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) @@ -633,9 +719,9 @@ NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) @@ -716,6 +802,14 @@ ADD C24, B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 + NEG C13, C13 + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + NEG C14, C14 + NEG C24, C24 + NEG C34, C34 + NEG C44, C44 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) @@ -730,18 +824,18 @@ MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 - NMSUB B2, B2, C13, A1 - NMSUB B4, B4, C23, A1 - NMSUB B6, B6, C33, A1 - NMSUB B8, B8, C43, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C43, 6 * SIZE(CO2) @@ -764,16 +858,16 @@ MADD C43, C43, C42, A1 ST B7, 6 * SIZE(CO1) - NMSUB C11, C11, C14, A1 + MADD C11, C11, C14, A1 ST B2, 1 * SIZE(CO1) - NMSUB C21, C21, C24, A1 + MADD C21, C21, C24, A1 ST B4, 3 * SIZE(CO1) - NMSUB C31, C31, C34, A1 + MADD C31, C31, C34, A1 ST B6, 5 * SIZE(CO1) - NMSUB C41, C41, C44, A1 + MADD C41, C41, C44, A1 ST B8, 7 * SIZE(CO1) NMSUB C13, C13, C14, A2 @@ -807,12 +901,700 @@ blez I, .L21 NOP + .align 4 +.L221: + move BO, B # 
Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L222 + PLU B4, B2, B2 + +.L2210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A5, B1 + MADPS C21, C21, A6, B1 + + MADPS C12, C12, A5, B2 + MADPS C22, C22, A6, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A7, B5 + MADPS C21, C21, A8, B5 + + MADPS C12, C12, A7, B6 + MADPS C22, C22, A8, B6 + + MADPS C13, C13, A7, B7 + MADPS C23, C23, A8, B7 + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L2210 + PLU B4, B2, B2 + + + .align 4 +.L222: + andi L, K, 2 + blez L, .L227 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + gsLQC1(R13, F9, F8, 0) # A1 A2 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L227: + andi L, K, 1 + blez L, .L220 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + .align 4 +.L220: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 
= alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + SUB C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + + LD B1, 0 * 
SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + .align 4 .L21: andi I, M, 1 blez I, .L20 NOP + .align 4 +.L211: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L212 + PLU B4, B2, B2 + +.L2110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A3, B1 + MADPS C12, C12, A3, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L2110 + PLU B4, B2, B2 + + + .align 4 +.L212: + andi L, K, 2 + blez L, .L217 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + gsLQC1(R12, F1, F0, 0) # A5 A6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L217: + andi L, K, 1 + blez L, .L210 + NOP + + MADPS C11, C11, A1, B1 + daddiu BO, BO, 4 * SIZE + MADPS C12, C12, A1, B2 + daddiu AO, AO, 2 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L210: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 
+ ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + NEG C14, C14 + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + .align 4 .L20: daddiu J, J, -1 @@ -827,6 +1609,835 @@ blez J, .L999 NOP +.L14: + dsra I, M, 2 # MR=8 + move AO, A # Reset A + move CO1, C + + blez I, .L12 + daddu C, CO1, LDC + + .align 4 +.L141: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + blez L, .L142 + PLU B4, B2, B2 + +.L1410: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, 
A8, B2 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + + MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A1, B5 + MADPS C21, C21, A2, B5 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B5 + MADPS C41, C41, A4, B5 + + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A1, B7 + MADPS C23, C23, A2, B7 + + MADPS C33, C33, A3, B7 + MADPS C43, C43, A4, B7 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B6 + MADPS C21, C21, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B6 + MADPS C41, C41, A8, B6 + + MADPS C13, C13, A5, B8 + MADPS C23, C23, A6, B8 + + MADPS C33, C33, A7, B8 + MADPS C43, C43, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1410 + PLU B4, B2, B2 + + + .align 4 +.L142: + andi L, K, 2 + blez L, .L147 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F8, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, A8, B2 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + + MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + PLU B3, B1, B1 + + + .align 4 +.L147: + andi L, K, 1 + blez L, .L140 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + + .align 4 +.L140: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, 
C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # ad'+'cb + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + + bgtz I, .L141 + daddiu CO1, CO1, 8 * SIZE + + .align 4 +.L12: + 
andi I, M, 2 # MR=4 + blez I, .L11 + NOP + + .align 4 +.L121: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L122 + PLU B4, B2, B2 + +.L1210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A7, B6 + MADPS C21, C21, A8, B6 + + MADPS C13, C13, A7, B8 + MADPS C23, C23, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1210 + PLU B4, B2, B2 + + + .align 4 +.L122: + andi L, K, 2 + blez L, .L127 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + PLU B7, B5, B5 + daddiu BO, BO, 1 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + .align 4 +.L127: + andi L, K, 1 + blez L, .L120 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + .align 4 +.L120: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, 
A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L112 + PLU B4, B2, B2 + +.L1110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + MADPS C11, C11, A3, B5 + MADPS C13, C13, A3, B7 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L1110 + PLU B4, B2, B2 + + + .align 4 +.L112: + andi L, K, 2 + blez L, .L117 + NOP + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + + .align 4 +.L117: + andi L, K, 1 + blez L, .L110 + NOP + + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 2 * SIZE + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + + + .align 4 +.L110: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb + LD 
A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + NEG C13, C13 + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + .align 4 .L10: move B, BO From ee4bb8bd2554f8cc5c539b2d9fc56d09836a338b Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 16 Sep 2011 16:08:39 +0000 Subject: [PATCH 23/30] Add ctrmm part in cgemm_kernel_loongson3a_4x2_ps.S. --- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 504 +++++++++++++++++- 1 file changed, 491 insertions(+), 13 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index b57213a24..16502216f 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -142,7 +142,7 @@ sd $24, 104($sp) sd $25, 112($sp) - LDARG OFFSET, STACKSIZE($sp) + LDARG OFFSET, STACKSIZE+8($sp) #endif #ifndef __64BIT__ @@ -157,59 +157,132 @@ dsra J, N, 1 # NR=2 ST $f15, 152($sp) +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE blez J, .L1 ST $f16, 160($sp) .L24: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + dsra I, M, 2 # MR=8 move AO, A # Reset A + + dsll PREA, K, 1 + ZBASE_SHIFT move CO1, C daddu CO2, C, LDC + daddu PREA, AO, PREA + blez I, .L22 daddu C, CO2, LDC .align 4 .L241: - move BO, B # Reset B - dsra L, K, 2 # UnRoll K=64 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 + dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 - gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C14, C11 - gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C23, C11 - FETCH $0, 0 * SIZE(CO1) - - FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - - MOV C33, C11 - FETCH $0, 0 * SIZE(CO2) - FETCH $0, 8 * SIZE(CO2) + MOV C33, C11 MOV C34, C11 + MOV C43, C11 + MOV C44, C11 PLU B3, B1, B1 PLU B4, B2, B2 + daddu PREB, BO, PREB + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + 
daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 blez L, .L242 + NOP + +#else + + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, ZBASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MOV C31, C11 + MOV C32, C11 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C41, C11 + MOV C42, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C23, C11 + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 MOV C44, C11 + daddu PREB, BO, PREB + + PLU B3, B1, B1 + PLU B4, B2, B2 + + FETCH $0, 8 * SIZE(CO1) + blez L, .L242 + FETCH $0, 8 * SIZE(CO2) +#endif .L2410: daddiu L, L, -1 @@ -225,9 +298,11 @@ MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 + FETCH $0, 0 * SIZE(PREB) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREA) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 @@ -239,6 +314,7 @@ PLU B7, B5, B5 PLU B8, B6, B6 + daddu PREB, PREB, 8 * SIZE MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -255,6 +331,7 @@ MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 + FETCH $0, 8 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 @@ -283,9 +360,10 @@ gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B1 - daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR MADPS C41, C41, A4, B1 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR @@ -317,11 +395,13 @@ MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 + FETCH $0, 24 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 + daddu PREA, PREA, 32 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -339,7 +419,11 @@ .align 4 .L242: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L247 NOP @@ -407,7 +491,11 @@ .align 4 .L247: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L240 NOP @@ -440,6 +528,7 @@ .align 4 .L240: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -891,6 +980,395 @@ #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B3, 2 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B5, 4 * SIZE(CO1) + MUL C11, C14, A1 
+ MUL C21, C24, A1 + + ST B7, 6 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B2, 1 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B4, 3 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B6, 5 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B8, 7 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B4, 3 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B6, 5 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST B8, 7 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + + SUB C34, C34, B7 + SUB C44, C44, B8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB 
C43, C43, C44, A2 + + ST B4, 3 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B6, 5 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST B8, 7 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + NEG C13, C13 + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + NEG C14, C14 + NEG C24, C24 + NEG C34, C34 + NEG C44, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B3, 2 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B5, 4 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B7, 6 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B2, 1 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B4, 3 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B6, 5 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B8, 7 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 2 + ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + +#endif daddiu CO1, CO1, 8 * SIZE bgtz I, .L241 daddiu CO2, CO2, 8 * SIZE From e08cfaf9ca9a65e28c4e0f790421aa03e7041c94 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 16 Sep 2011 17:50:40 +0000 Subject: [PATCH 24/30] Complete all the complex single-precision functions of level3, but the performance needs further improve. 
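For reference, the arithmetic behind the NN/NR/RN/RR write-back blocks in this kernel can be summarised with a rough C sketch (illustrative only; the function and variable names below are made up and the file is not part of this patch). Each micro-kernel accumulates the four partial products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r per element of C, combines them according to the conjugation variant, then applies alpha with MADD/NMSUB; the trmm branches use the same arithmetic but overwrite C (MUL) instead of accumulating into the loaded C values.

    #include <stdio.h>

    /* conj_a / conj_b select the conjugation variant: 0,0 ~ NN, 0,1 ~ NR, 1,0 ~ RN, 1,1 ~ RR */
    static void cgemm_write_back(float *c_re, float *c_im,
                                 float arbr, float aibi, float arbi, float aibr,
                                 float alpha_r, float alpha_i,
                                 int conj_a, int conj_b)
    {
        float sa = conj_a ? -1.0f : 1.0f;      /* sign of the imaginary part of A */
        float sb = conj_b ? -1.0f : 1.0f;      /* sign of the imaginary part of B */

        float re = arbr - sa * sb * aibi;      /* SUB/ADD on the CVTU halves       */
        float im = sb * arbi + sa * aibr;      /* ADD/SUB, plus NEG in the RR case */

        *c_re += alpha_r * re - alpha_i * im;  /* MADD ..., alpha_r ; NMSUB ..., alpha_i */
        *c_im += alpha_r * im + alpha_i * re;  /* MADD ..., alpha_r ; MADD  ..., alpha_i */
    }

    int main(void)
    {
        float cr = 0.0f, ci = 0.0f;
        /* one update with a = 1+2i, b = 3+4i, alpha = 1, NN variant */
        cgemm_write_back(&cr, &ci, 1.0f*3.0f, 2.0f*4.0f, 1.0f*4.0f, 2.0f*3.0f,
                         1.0f, 0.0f, 0, 0);
        printf("%g %g\n", cr, ci);             /* prints -5 10, i.e. (1+2i)*(3+4i) */
        return 0;
    }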
--- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 1091 ++++++++++++++++- 1 file changed, 1081 insertions(+), 10 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 16502216f..e78ad209f 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -1381,6 +1381,49 @@ .align 4 .L221: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # MR=2 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L222 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -1407,6 +1450,7 @@ PLU B3, B1, B1 blez L, .L222 PLU B4, B2, B2 +#endif .L2210: daddiu L, L, -1 @@ -1484,7 +1528,11 @@ .align 4 .L222: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L227 NOP @@ -1527,7 +1575,11 @@ .align 4 .L227: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L220 NOP @@ -1547,6 +1599,7 @@ .align 4 .L220: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -1800,6 +1853,239 @@ ST B8, 3 * SIZE(CO2) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + SUB C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, 
A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE @@ -1812,6 +2098,41 @@ .align 4 .L211: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L212 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -1829,6 +2150,7 @@ PLU B3, 
B1, B1 blez L, .L212 PLU B4, B2, B2 +#endif .L2110: daddiu L, L, -1 @@ -1880,7 +2202,11 @@ .align 4 .L212: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L217 NOP @@ -1910,7 +2236,11 @@ .align 4 .L217: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L210 NOP @@ -1924,13 +2254,13 @@ .align 4 .L210: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 CVTU A5, C12 CVTU A7, C14 - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd @@ -2069,6 +2399,149 @@ ST B6, 1 * SIZE(CO2) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + NEG C14, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + 
+#ifdef LEFT + daddiu KK, KK, 1 +#endif + +#endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE @@ -2077,6 +2550,11 @@ .L20: daddiu J, J, -1 move B, BO + +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + bgtz J, .L24 NOP @@ -2090,13 +2568,58 @@ .L14: dsra I, M, 2 # MR=8 move AO, A # Reset A - move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move CO1, C blez I, .L12 daddu C, CO1, LDC .align 4 .L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 # define Mr=4 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L142 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2120,6 +2643,7 @@ PLU B3, B1, B1 blez L, .L142 PLU B4, B2, B2 +#endif .L1410: daddiu L, L, -1 @@ -2193,7 +2717,11 @@ .align 4 .L142: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L147 NOP @@ -2232,7 +2760,11 @@ .align 4 .L147: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L140 NOP @@ -2253,6 +2785,7 @@ .align 4 .L140: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -2433,20 +2966,20 @@ #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ - SUB C11, C11, A1 # ac'+'bd + SUB C11, C11, A1 # AC'+'BD SUB C21, C21, A2 SUB C31, C31, A3 - LD A1, 152($sp) # load alpha_r -# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R SUB C41, C41, A4 LD A2, 160($sp) -# LD A2, 0 * SIZE(A) # load alpha_i +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I - ADD C13, A5, C13 # ad'+'cb + ADD C13, A5, C13 # AD'+'CB ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 - NEG C13, C13 # ad'+'cb + NEG C13, C13 # AD'+'CB NEG C23, C23 NEG C33, C33 NEG C43, C43 @@ -2461,7 +2994,7 @@ LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B1, B1, C11, A1 # A1 = ALPHA_R MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 @@ -2469,6 +3002,74 @@ MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# 
LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 @@ -2488,6 +3089,159 @@ ST B8, 7 * SIZE(CO1) #endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # AC'+'BD + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I + + ADD C13, A5, C13 # AD'+'CB + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # AD'+'CB + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + + MUL B1, C11, A1 # A1 = ALPHA_R + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + 
(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + +#endif bgtz I, .L141 daddiu CO1, CO1, 8 * SIZE @@ -2499,6 +3253,42 @@ .align 4 .L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L122 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2516,6 +3306,7 @@ PLU B3, B1, B1 blez L, .L122 PLU B4, B2, B2 +#endif .L1210: daddiu L, L, -1 @@ -2561,7 +3352,11 @@ .align 4 .L122: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L127 NOP @@ -2588,7 +3383,11 @@ .align 4 .L127: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L120 NOP @@ -2602,6 +3401,7 @@ .align 4 .L120: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -2737,6 +3537,141 @@ ST B4, 3 * SIZE(CO1) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * 
SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE @@ -2749,6 +3684,37 @@ .align 4 .L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L112 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2763,6 +3729,7 @@ PLU B3, B1, B1 blez L, .L112 PLU B4, B2, B2 +#endif .L1110: daddiu L, L, -1 @@ -2796,7 +3763,11 @@ .align 4 .L112: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L117 NOP @@ -2815,7 +3786,11 @@ .align 4 .L117: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L110 NOP @@ -2828,11 +3803,11 @@ .align 4 .L110: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd @@ -2912,6 +3887,99 @@ ST B2, 1 * SIZE(CO1) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # 
load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + NEG C13, C13 + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + +#endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE @@ -2919,6 +3987,9 @@ .align 4 .L10: move B, BO +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif .L999: ld $16, 0($sp) From 831858b88351e350c9f6ad5c2d7f0c70d4cce18c Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 23 Sep 2011 20:59:48 +0000 Subject: [PATCH 25/30] Modify aligned address of sa and sb to improve the performance of multi-threads. --- driver/level3/gemm_thread_n.c | 6 +++--- driver/others/parameter.c | 4 ++-- param.h | 25 +++++++------------------ 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index ba54612eb..62907fa65 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,15 +71,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; //NULL; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;//NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { queue[0].sa = sa; - queue[0].sb = sb; + queue[0].sb = sa + GEMM_OFFSET_A1 * 5; queue[num_cpu - 1].next = NULL; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 4a8542a93..fc7f0447e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -688,11 +688,11 @@ void blas_set_parameter(void){ if(blas_num_threads == 1){ #endif //single thread - dgemm_r = 1000; + dgemm_r = 1024; #ifdef SMP }else{ //multi thread - dgemm_r = 300; + dgemm_r = 200; } #endif #endif diff --git a/param.h b/param.h index 1c729e8b9..4ffe05cf8 100644 --- a/param.h +++ b/param.h @@ -1493,33 +1493,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_P 44 #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 112 -#define CGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1024 -//#define DGEMM_DEFAULT_R 300 -//#define DGEMM_DEFAULT_R 200 -//#define DGEMM_DEFAULT_R 400 -//#define DGEMM_DEFAULT_R 192 -#define DGEMM_DEFAULT_R dgemm_r -//1000 -//#define DGEMM_DEFAULT_R 160 -//#define DGEMM_DEFAULT_R 270 +#define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 1024 -//#define ZGEMM_DEFAULT_R 1000 -#define ZGEMM_DEFAULT_R 1000 +#define ZGEMM_DEFAULT_R 1024 -#define GEMM_OFFSET_A1 (DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*SIZE) -//#define GEMM_OFFSET_B1 0x10 -#define GEMM_OFFSET_B1 (DGEMM_DEFAULT_Q*DGEMM_DEFAULT_R*SIZE) -#define GEMM_OFFSET 0x100000 -#define GEMM_OFFSET1 0x40000 +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 #define SYMV_P 16 #endif From 9fe3049de67495e1ca916141624c985a80f3d6cb Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 26 Sep 2011 15:21:45 +0000 Subject: [PATCH 26/30] Adding conditional compilation(#if defined(LOONGSON3A)) to avoid affecting the performance of other platforms. --- driver/level3/gemm_thread_n.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index 62907fa65..f9007f831 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; //NULL; - queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;//NULL; +#if defined(LOONGSON3A) + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; +#else + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; +#endif queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { +#if defined(LOONGSON3A) queue[0].sa = sa; queue[0].sb = sa + GEMM_OFFSET_A1 * 5; - +#else + queue[0].sa = sa; + queue[0].sb = sb; +#endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, From c1e618ea2d7fc44c6e90c2cb728124249e688947 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 3 Nov 2011 13:53:48 +0000 Subject: [PATCH 27/30] Add complete gemv function on Loongson3a platform. 
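The gemv kernels added below are written in plain C with manual prefetching and small fixed unroll factors (4 and 3 for the real cases, 2 for the complex ones) rather than in hand-written assembly. Stripped of that tuning, the non-transposed kernel computes the usual column-major update y += alpha * A * x. The following is only a rough reference sketch of that core loop, assuming the BLASLONG/FLOAT typedefs from common.h; the real kernels below add the prefetch hints, the unrolled bodies and the strided (INCX/INCY != 1) variants:

        /* reference form of the column-major GEMV_N update (sketch only) */
        static void gemv_n_reference(BLASLONG m, BLASLONG n, FLOAT alpha,
                                     FLOAT *a, BLASLONG lda,
                                     FLOAT *x, FLOAT *y)
        {
                BLASLONG i, j;

                /* column j of A, scaled by alpha*x[j], is accumulated into y */
                for (j = 0; j < n; j++)
                        for (i = 0; i < m; i++)
                                y[i] += alpha * a[lda * j + i] * x[j];
        }

The transposed kernel accumulates the other way around, y[j] += alpha * A[lda * j + i] * x[i], and the zgemv variants expand each product into the four real/imaginary partial products visible in the spec_loop/norm_loop macros.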
--- kernel/mips64/KERNEL.LOONGSON3A | 10 +++ kernel/mips64/gemv_n_loongson3a.c | 98 ++++++++++++++++++++++++++++++ kernel/mips64/gemv_t_loongson3a.c | 93 ++++++++++++++++++++++++++++ kernel/mips64/zgemv_n_loongson3a.c | 92 ++++++++++++++++++++++++++++ kernel/mips64/zgemv_t_loongson3a.c | 91 +++++++++++++++++++++++++++ 5 files changed, 384 insertions(+) create mode 100644 kernel/mips64/gemv_n_loongson3a.c create mode 100644 kernel/mips64/gemv_t_loongson3a.c create mode 100644 kernel/mips64/zgemv_n_loongson3a.c create mode 100644 kernel/mips64/zgemv_t_loongson3a.c diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 91f2e7dd1..fc247e473 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,6 +1,16 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c + + SGEMMKERNEL = sgemm_kernel_8x4_ps.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c new file mode 100644 index 000000000..bb27379f5 --- /dev/null +++ b/kernel/mips64/gemv_n_loongson3a.c @@ -0,0 +1,98 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. + +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) +#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) +#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) +#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) +{ + + if(!ALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 4; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0; + + if(ALPHA == 1) { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[i + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M);) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[h + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M);) { + norm_loop_alpha1; + } + } + } + } else { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[i + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } 
+ for(; likely(i < M);) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[h + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M);) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c new file mode 100644 index 000000000..5c6c8389e --- /dev/null +++ b/kernel/mips64/gemv_t_loongson3a.c @@ -0,0 +1,93 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. + +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) +#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) +#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) +#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!ALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 3; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0; + + if(ALPHA == 1) { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M);) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M);) { + norm_loop_alpha1; + } + } + } + } else { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M);) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M);) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c new file mode 100644 index 000000000..f8275c371 --- /dev/null +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -0,0 +1,92 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. 
+ +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) +//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) +#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) +#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!rALPHA && iALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 60; + BLASLONG spec_unroll = 2; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0, jj=0; + + + if(rALPHA == 1 && iALPHA == 0) { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M); i++) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M); i++) { + norm_loop_alpha1; + } + } + } + } else { + FLOAT rTmp, iTmp; + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M); i++) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M); i++) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c new file mode 100644 index 000000000..4b2c2b6b5 --- /dev/null +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -0,0 +1,91 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. 
+//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) +#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!rALPHA && iALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 2; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0, jj=0; + + + if(rALPHA == 1 && iALPHA == 0) { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[ii + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M); i++) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[iii + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M); i++) { + norm_loop_alpha1; + } + } + } + } else { + FLOAT rTmp, iTmp; + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[ii + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M); i++) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[iii + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M); i++) { + norm_loop; + } + } + } + } + return 0; +} From a32e56500ac4cfad0e60a6a4f7671bfee54195e6 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 4 Nov 2011 19:32:21 +0000 Subject: [PATCH 28/30] Fix the compute error of gemv when incx and incy are negative numbers. 
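For a negative increment the BLAS convention places the first logical element of the vector at offset (1-n)*incx, i.e. the vector is addressed from its far end, so just flipping the sign of INCX/INCY (as the previous version did) walks the elements in the wrong order. The change below keeps the increments signed and introduces kx/ky starting offsets in the gemv_n kernel. As a sketch of the usual convention only, not verbatim the code in this patch:

        /* conventional start offsets for possibly-negative increments */
        BLASLONG kx = (INCX > 0) ? 0 : (1 - N) * INCX;  /* first element of X */
        BLASLONG ky = (INCY > 0) ? 0 : (1 - M) * INCY;  /* first element of Y */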
--- kernel/mips64/gemv_n_loongson3a.c | 23 +++++++++++++---------- kernel/mips64/gemv_t_loongson3a.c | 8 ++++---- kernel/mips64/zgemv_n_loongson3a.c | 8 ++++---- kernel/mips64/zgemv_t_loongson3a.c | 8 ++++---- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c index bb27379f5..7db595449 100644 --- a/kernel/mips64/gemv_n_loongson3a.c +++ b/kernel/mips64/gemv_n_loongson3a.c @@ -16,13 +16,16 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + BLASLONG kx=0, ky=0; if(!ALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; + //if(INCX < 0) + // kx = (1-N) * INCX; + // INCX = -INCX; + //if(INCY < 0) + // ky = (1-M) * INCY; + // INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 4; @@ -31,7 +34,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO if(ALPHA == 1) { if(INCY == 1) { - for(; likely(j < N); j++, k += INCX) { + for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); @@ -46,8 +49,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } } else { - for(; likely(j < N); j++, k += INCX) { - BLASLONG i = 0, h = 0; + for(k=kx; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); @@ -63,7 +66,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } else { if(INCY == 1) { - for(; likely(j < N); j++, k += INCX) { + for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); @@ -78,8 +81,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } } else { - for(; likely(j < N); j++, k += INCX) { - BLASLONG i = 0, h = 0; + for(k=kx; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c index 5c6c8389e..51f035d8e 100644 --- a/kernel/mips64/gemv_t_loongson3a.c +++ b/kernel/mips64/gemv_t_loongson3a.c @@ -18,10 +18,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO if(!ALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 3; diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c index f8275c371..7b094de80 100644 --- a/kernel/mips64/zgemv_n_loongson3a.c +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -19,10 +19,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, F if(!rALPHA && iALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 60; BLASLONG spec_unroll = 2; diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c index 4b2c2b6b5..3835879ad 100644 --- a/kernel/mips64/zgemv_t_loongson3a.c +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -18,10 +18,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, F if(!rALPHA && iALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) 
- INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 2; From 2d78fb05c8a2fda923fec94aeb5eb16f1bf7671f Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 10 Nov 2011 15:38:48 +0000 Subject: [PATCH 29/30] Add conjugate condition to gemv. --- kernel/mips64/zgemv_n_loongson3a.c | 79 ++++++++++++++++++++++++------ kernel/mips64/zgemv_t_loongson3a.c | 66 +++++++++++++++++++------ 2 files changed, 113 insertions(+), 32 deletions(-) diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c index 7b094de80..3b1b6f73b 100644 --- a/kernel/mips64/zgemv_n_loongson3a.c +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -1,34 +1,81 @@ -#include "common.h" +#include "common.h" -//These are auto-tuning codes on Loongson-3A platform. +//typedef int BLASLONG; +//typedef double FLOAT; -//#define prefetch(x) __builtin_prefetch(x) -//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) -//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) -#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) -#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) -#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 +#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + +#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define 
spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; -// if(INCX < 0) -// INCX = -INCX; -// if(INCY < 0) -// INCY = -INCY; - BLASLONG fahead = 60; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; - BLASLONG j = 0, k = 0, jj=0; - + BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCY == 1) { diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c index 3835879ad..3af44caf2 100644 --- a/kernel/mips64/zgemv_t_loongson3a.c +++ 
b/kernel/mips64/zgemv_t_loongson3a.c @@ -1,33 +1,67 @@ -#include "common.h" +#include "common.h" -//These are auto-tuning codes on Loongson-3A platform. -//#define prefetch(x) __builtin_prefetch(x) -//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) -//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) -#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) -#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) -#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 +#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + + +#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 
1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) + +#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; -// if(INCX < 0) -// INCX = -INCX; -// if(INCY < 0) -// INCY = -INCY; - BLASLONG fahead = 30; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; - BLASLONG j = 0, k = 0, jj=0; - + BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCX == 1) { From a4292976e91eeab0c0e8aa8e6b81a9074e9933cb Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 5 Dec 2011 14:54:25 +0000 Subject: [PATCH 30/30] Adding detection of complex situations in symm.c, otherwise the buffer address of sb will overlap the end of sa. 
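For complex routines the threaded driver has to account for two FLOATs per element when it carves the shared sa/sb work areas out of one buffer (the LOONGSON3A partitioning added earlier in gemm_thread_n.c); advertising a complex symm with the BLAS_*_REAL mode bits undersizes that reservation, which is how sb ends up overlapping the end of sa. The change below simply selects the BLAS_*_COMPLEX mode bits when COMPLEX is defined. A rough picture of the overlap, with made-up names and not the library's actual buffer arithmetic:

        /* illustration only (hypothetical panel dimensions, made-up names)   */
        size_t p = 32, q = 80;
        size_t real_panel    = p * q;        /* FLOATs in a real p x q panel  */
        size_t complex_panel = p * q * 2;    /* same panel with complex data  */
        /* If sb is placed at sa + real_panel while the data is complex, the  */
        /* sa panel actually extends complex_panel FLOATs, i.e. past sb, so   */
        /* the two regions overlap.                                           */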
--- interface/symm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/symm.c b/interface/symm.c index a0d52c49d..b447f13e8 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY)