From 1c96d345e22efcc68df697584282d8cc00361e18 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 21 Jun 2011 22:16:23 +0000 Subject: [PATCH 01/52] Improve zgemm performance from 1G to 1.8G, change block size in param.h. --- kernel/mips64/KERNEL.LOONGSON3A | 6 + kernel/mips64/zgemm_kernel_loongson3a.S | 923 ++++++++++++++++++++++++ param.h | 13 +- 3 files changed, 936 insertions(+), 6 deletions(-) create mode 100644 kernel/mips64/zgemm_kernel_loongson3a.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index e72ac142e..94c8b1b9a 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -13,6 +13,12 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_loongson3a.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S new file mode 100644 index 000000000..0b0d73137 --- /dev/null +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -0,0 +1,923 @@ +#define ASSEMBLER +#include "common.h" + + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || 
defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + + dsra J, N, 1 # J=N/2 + ST ALPHA_I, 136($sp) + + dsll PREB, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + blez J, .L20 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + .align 5 +.L10: + daddiu J, J, -1 + move CO1, C # Fix pointer Cx + + daddu CO2, C, LDC + move AO, A # Reset AO + + dsra I, M, 1 # I=M/2 + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double + daddu PREB, PREB, B # PREA=A+panel size + + FETCH $0, 0 * SIZE(CO1) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO2) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c41, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c42, c11 + MOV c43, c11 + + FETCH $0, 4 * SIZE(CO2) + blez L, .L15 + MOV c44, c11 + + .align 5 + +.L12: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F13, F12, 2) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F11, F10, 3) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + gsLQC1(R13, F16, F15, 3) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 4) # Unroll K=2 + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + gsLQC1(R13, F5, F4, 4) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + gsLQC1(R12, F3, F2, 5) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + gsLQC1(R13, F7, F6, 5) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + daddiu L, L, -1 + + gsLQC1(R13, F13, F12, 6) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F11, F10, 7) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + gsLQC1(R13, F16, F15, 7) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, 
c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddu PREA, PREA, 16 * SIZE + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREB, PREB, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: + andi L, K, 3 + LD ALPHA_R, 128($sp) + NOP + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu L, L, -1 + daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + bgtz L, .L16 + NOP + +.L18: + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + ST a1, 0 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST a2, 1 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b1, 2 * SIZE(CO1) + + ST b2, 3 * SIZE(CO1) + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + blez I, .L19 + daddu C, C, LDC # Change C to next panel + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + MOV c31, c11 + MOV c32, c11 + MOV c33, c11 + MOV c34, c11 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + blez L, .L35 + NOP + + .align 3 + +.L32: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F13, 
F12, 2) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F5, F4, 4) + gsLQC1(R13, F7, F6, 5) + + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + + daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F13, F12, 6) + gsLQC1(R13, F16, F15, 7) + + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + MADD1 c31, c31, a7, b7 # A1xB2 + MADD3 c33, c33, a7, b8 + MADD2 c32, c32, a8, b7 + MADD4 c34, c34, a8, b8 + + bgtz L, .L32 + NOP + + .align 3 + +.L35: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L38 + NOP + .align 3 + +.L36: + daddiu L, L, -1 + daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + bgtz L, .L36 + NOP + +.L38: + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + LD a3, 0 * SIZE(CO2) + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + + .align 3 + +.L19: + bgtz J, .L10 + move B, BO + + .align 3 + +.L20: + andi J, N, 1 + blez J, .L999 + NOP + + move CO1, C + move AO, A # Reset AO + + dsra I, M, 1 # I=M/2 + blez I, .L29 + NOP + +.L21: + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + MOV c21, c11 + MOV c22, c11 + MOV c23, c11 + MOV c24, c11 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + blez L, .L25 + NOP + + .align 3 + +.L22: + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 
4) # Unroll K=2 + gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F13, F12, 2) + + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + + + daddiu L, L, -1 + gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R12, F11, F10, 7) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + MADD2 c22, c22, a8, b7 + MADD4 c24, c24, a8, b8 + + bgtz L, .L22 + NOP + + .align 3 + +.L25: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L28 + NOP + .align 3 + +.L26: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L26 + NOP + +.L28: + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + LD b1, 2 * SIZE(CO1) + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + MOV c12, c11 + MOV c13, c11 + MOV c14, c11 + + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + blez L, .L45 + NOP + + .align 3 + +.L42: + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) + + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + + daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) + + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, 
b8 # bxd + + bgtz L, .L42 + NOP + + .align 3 + +.L45: + andi L, K, 3 + LD ALPHA_R, 128($sp) + LD ALPHA_I, 136($sp) + blez L, .L48 + NOP + .align 3 + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + bgtz L, .L46 + NOP + +.L48: + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + daddiu CO1,CO1, 2 * SIZE + + + + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/param.h b/param.h index 603caab46..b7f0d662a 100644 --- a/param.h +++ b/param.h @@ -1488,23 +1488,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 32 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 108 -#define ZGEMM_DEFAULT_P 112 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 -#define ZGEMM_DEFAULT_Q 72 +#define ZGEMM_DEFAULT_Q 60 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 #define CGEMM_DEFAULT_R 2000 -#define ZGEMM_DEFAULT_R 2000 +#define ZGEMM_DEFAULT_R 1000 #define SYMV_P 16 #endif From 14f81da375232998e8c1f149ab61db43bfb300af Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 23 Jun 2011 10:46:58 +0000 Subject: [PATCH 02/52] Change prefetch length of A and B, the performance is 2.1G now. 
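FETCH is a plain `ld` whose destination is $0, so the loaded value is
thrown away and the instruction only pulls the addressed line into the
data cache, i.e. it acts as a software prefetch. PREA and PREB start one
packed micro-panel ahead of AO and BO (PREA = A + K*2 complex elements,
from `dsll PREA, K, 1 + ZBASE_SHIFT`) and advance in lock step with the
compute, so each pass warms the lines the next panel pass will need.
Roughly the same pattern in C (an illustrative sketch, not code from this
tree; `__builtin_prefetch` stands in for FETCH, and prefetching every
iteration is a simplification: the real loop is unrolled four times and
issues one FETCH per 32 bytes):

    /* 2x2 double-complex micro-kernel with software prefetch.
     * a, b: packed panels, two complex values (4 doubles) per K step;
     * c[8]: 2x2 complex accumulators, re/im interleaved. */
    static void zgemm_2x2(const double *a, const double *b,
                          double *c, long k)
    {
        const double *pa = a + 4 * k;      /* next A panel, like PREA */
        const double *pb = b + 4 * k;      /* next B panel, like PREB */

        for (long l = 0; l < k; l++, a += 4, b += 4, pa += 4, pb += 4) {
            __builtin_prefetch(pa, 0, 0);  /* FETCH $0, n * SIZE(PREA) */
            __builtin_prefetch(pb, 0, 0);  /* FETCH $0, n * SIZE(PREB) */

            for (int i = 0; i < 2; i++)        /* rows from A */
                for (int j = 0; j < 2; j++) {  /* cols from B */
                    double ar = a[2*i], ai = a[2*i+1];
                    double br = b[2*j], bi = b[2*j+1];
                    /* NN case: MADD1 (+ar*br) and MADD4 (-ai*bi) build
                     * the real part, MADD2 (+ai*br) and MADD3 (+ar*bi)
                     * the imaginary part; .L18 then sums c11+c14 and
                     * c12+c13. */
                    c[4*i + 2*j]     += ar * br - ai * bi;
                    c[4*i + 2*j + 1] += ai * br + ar * bi;
                }
        }
    }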
--- kernel/mips64/zgemm_kernel_loongson3a.S | 373 +++++++++++++----------- 1 file changed, 207 insertions(+), 166 deletions(-) diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 0b0d73137..49603675a 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -6,6 +6,7 @@ #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + #define STACKSIZE 160 #define M $4 #define N $5 @@ -109,12 +110,18 @@ #define ALPHA_R $f15 #define ALPHA_I $f16 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +####if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB -#endif +###endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD @@ -166,25 +173,28 @@ sdc1 $f23,112($sp) #endif - dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i - dsra J, N, 1 # J=N/2 + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 ST ALPHA_I, 136($sp) - dsll PREB, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 - blez J, .L20 - dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 .align 5 .L10: daddiu J, J, -1 - move CO1, C # Fix pointer Cx - - daddu CO2, C, LDC - move AO, A # Reset AO - dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size @@ -192,41 +202,32 @@ dsra L, K, 2 # Unroll K 4 times move BO, B - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 - - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c23, c11 MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - FETCH $0, 0 * SIZE(PREA) # LOAD 32 Byte 4 double - daddu PREB, PREB, B # PREA=A+panel size - - FETCH $0, 0 * SIZE(CO1) MOV c31, c11 MOV c32, c11 - - FETCH $0, 0 * SIZE(CO2) + MOV c33, c11 MOV c34, c11 - - FETCH $0, 0 * SIZE(PREB) + MOV c41, c11 - - FETCH $0, 4 * SIZE(CO1) MOV c42, c11 + MOV c43, c11 - - FETCH $0, 4 * SIZE(CO2) blez L, .L15 MOV c44, c11 @@ -234,26 +235,26 @@ .L12: gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R13, F13, F12, 2) + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - gsLQC1(R13, F16, F15, 3) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -262,27 +263,27 @@ MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 - gsLQC1(R12, F1, F0, 4) # Unroll K=2 + gsLQC1(R12, F1, F0, 4) # unroll k=2 + gsLQC1(R13, F5, F4, 4) MADD1 c11, c11, a5, b5 # axc A1xB1 
MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 4) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F7, F6, 5) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 5) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -292,61 +293,61 @@ MADD4 c44, c44, a8, b8 gsLQC1(R12, F9, F8, 6) # Unroll K=3 + gsLQC1(R13, F13, F12, 6) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu L, L, -1 - gsLQC1(R13, F13, F12, 6) + gsLQC1(R13, F16, F15, 7) + gsLQC1(R12, F11, F10, 7) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx - gsLQC1(R13, F16, F15, 7) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - daddu PREA, PREA, 16 * SIZE MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE daddu PREB, PREB, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 - FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 @@ -362,46 +363,52 @@ .L15: andi L, K, 3 LD ALPHA_R, 128($sp) - NOP blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx - + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 + FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - bgtz L, .L16 NOP .L18: + ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -426,170 +433,196 @@ MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 - MADD a3, a3, ALPHA_R, c31 - MADD a4, a4, ALPHA_R, c32 - MADD b3, b3, ALPHA_R, c41 - MADD b4, b4, ALPHA_R, c42 - NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, 
b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 - ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 - ST b1, 2 * SIZE(CO1) - ST b2, 3 * SIZE(CO1) + ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 12 * SIZE(CO2) + FETCH $0, 12 * SIZE(CO1) + FETCH $0, 16 * SIZE(CO2) + FETCH $0, 16 * SIZE(CO1) + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE + .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c31, c11 MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + MOV c33, c11 + blez L, .L35 MOV c34, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - - blez L, .L35 - NOP - - .align 3 + .align 5 .L32: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F13, F12, 2) - gsLQC1(R13, F16, F15, 3) - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + NOP MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 + NOP gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R13, F5, F4, 4) - gsLQC1(R13, F7, F6, 5) - MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd + + gsLQC1(R13, F7, F6, 5) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd + NOP MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 - daddiu L, L, -1 + gsLQC1(R12, F11, F10, 3) gsLQC1(R13, F13, F12, 6) - gsLQC1(R13, F16, F15, 7) - MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd + + gsLQC1(R13, F16, F15, 7) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd - - daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 + + FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd + daddiu PREB, PREB, 16 * SIZE MADD1 c31, c31, a7, b7 # A1xB2 MADD3 c33, c33, a7, b8 + + FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 + bgtz L, .L32 MADD4 c34, c34, a8, b8 - bgtz L, .L32 - NOP - - .align 3 .L35: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) - blez L, .L38 NOP - .align 3 + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 .L36: - daddiu L, L, -1 - daddiu BO, BO, 2 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 
MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + NOP bgtz L, .L36 - NOP + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: ADD c11, c14, c11 - ADD c12, c13, c12 - - ADD c31, c34, c31 - ADD c32, c33, c32 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 @@ -613,43 +646,48 @@ daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE - .align 3 + .align 5 .L19: bgtz J, .L10 move B, BO - .align 3 + .align 5 .L20: andi J, N, 1 blez J, .L999 - NOP - - move CO1, C - move AO, A # Reset AO + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 + move CO1, C + + move AO, A # Reset AO blez I, .L29 - NOP + daddu PREA, PREA, A .L21: dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP @@ -658,110 +696,116 @@ .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 gsLQC1(R12, F1, F0, 4) # Unroll K=2 - gsLQC1(R12, F3, F2, 5) - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd + gsLQC1(R12, F3, F2, 5) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 - - daddiu L, L, -1 - gsLQC1(R12, F9, F8, 6) # Unroll K=3 - gsLQC1(R12, F11, F10, 7) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F9, F8, 6) # Unroll K=3 MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd + + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd - daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx - daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx - + gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd + daddiu PREA, PREA, 16 * SIZE + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) 
MADD2 c22, c22, a8, b7 + bgtz L, .L22 MADD4 c24, c24, a8, b8 - bgtz L, .L22 - NOP - - .align 3 .L25: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) + blez L, .L28 - NOP + LD ALPHA_I, 136($sp) .align 3 .L26: - daddiu L, L, -1 - daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx - daddiu AO, AO, 2 * SIZE * COMPSIZE # 2mr*1kr*cmpx + daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 bgtz L, .L26 - NOP + FETCH $0, 0 * SIZE(PREA) .L28: ADD c11, c14, c11 - ADD c12, c13, c12 - ADD c21, c24, c21 - ADD c22, c23, c22 - LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 @@ -792,15 +836,16 @@ dsra L, K, 2 # Unroll K 4 times move BO, B + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 - - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP @@ -808,53 +853,49 @@ .L42: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 - MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd + + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd - daddiu L, L, -1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F11, F10, 3) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd - MADD2 c12, c12, a6, b5 # bxc - MADD4 c14, c14, a6, b6 # bxd - - daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F16, F15, 3) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a8, b7 # bxc + bgtz L, .L42 MADD4 c14, c14, a8, b8 # bxd - bgtz L, .L42 - NOP - .align 3 + .align 5 .L45: andi L, K, 3 LD ALPHA_R, 128($sp) - LD ALPHA_I, 136($sp) blez L, .L48 - NOP - .align 3 + LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 @@ -892,7 +933,7 @@ - .align 3 + .align 5 .L999: LDARG $16, 0($sp) From e72113f06a33c7e8dfc799fa1edf3f85f5dd6fc1 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 23 Jun 2011 21:11:00 +0000 Subject: [PATCH 03/52] Add ztrmm and ztrsm part on loongson3a. The average performance is 2.2G. 
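The TRMM variants reuse this GEMM loop but trim the inner K range, since
for a triangular factor only part of each packed panel holds data. KK
carries the diagonal offset: depending on LEFT/TRANSA a tile either
starts KK positions into the packed A and B panels, or runs only for the
first KK+MR (resp. KK+NR) iterations. The same bookkeeping in C (a sketch
assuming MR = NR = 2, with `left`/`transa` standing for the LEFT/TRANSA
preprocessor flags; the names are mine, not the generic OpenBLAS driver):

    /* Tile start pointers and effective K length, mirroring the
     * KK/TEMP setup emitted before .L12; a and b are the current
     * positions in the packed panels. */
    static void trmm_tile_setup(long k, long kk, int left, int transa,
                                const double *a, const double *b,
                                const double **ao, const double **bo,
                                long *klen)
    {
        *ao = a;
        *bo = b;
        if (!((left && transa) || (!left && !transa))) {
            *ao = a + kk * 2 * 2;    /* skip kk packed columns of A */
            *bo = b + kk * 2 * 2;    /* skip kk packed rows of B    */
        }                            /* (x2 complex, x2 re/im)      */

        if ((left && !transa) || (!left && transa))
            *klen = k - kk;          /* populated tail of the K range */
        else
            *klen = kk + 2;          /* head of K up to the diagonal  */
    }

After each tile KK advances by MR in the LEFT case, or by NR at the end
of the J loop otherwise, and the write-back uses MUL instead of MADD
because TRMM overwrites C rather than accumulating into it.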
--- kernel/mips64/KERNEL | 11 + kernel/mips64/KERNEL.LOONGSON3A | 9 + kernel/mips64/zgemm_kernel_loongson3a.S | 441 ++++++++++++++++++++++-- param.h | 2 +- 4 files changed, 438 insertions(+), 25 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index ebb447b11..a14b1cb38 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -128,10 +128,21 @@ CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S +ifndef ZTRSMKERNEL_LN ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 94c8b1b9a..706f48128 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -28,3 +28,12 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S index 49603675a..13022f698 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a.S @@ -1,12 +1,10 @@ #define ASSEMBLER #include "common.h" - #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) - #define STACKSIZE 160 #define M $4 #define N $5 @@ -116,12 +114,12 @@ ## MADD3 a*d ## MADD4 d*b ################################## -####if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB -###endif +#endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD @@ -175,6 +173,9 @@ dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE blez J, .L20 @@ -183,6 +184,10 @@ .align 5 .L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + daddiu J, J, -1 dsra I, M, 1 # I=M/2 @@ -193,12 +198,66 @@ daddu CO2, C, LDC move AO, A # Reset AO - daddu PREB, PREB, B # PREA=A+panel size - blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2 + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, c11 # Clear results regs + MOV c12, c11 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + + MOV c13, c11 + MOV c14, c11 + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + + MOV c21, c11 + MOV c22, c11 + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + + MOV c23, c11 + MOV c24, c11 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + 
FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times move BO, B @@ -218,18 +277,25 @@ MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 + FETCH $0, 4 * SIZE(CO1) MOV c43, c11 + + daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 +#endif .align 5 @@ -361,8 +427,13 @@ .align 5 .L15: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L18 LD ALPHA_I, 136($sp) @@ -408,7 +479,7 @@ NOP .L18: - +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -458,20 +529,75 @@ ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) - FETCH $0, 4 * SIZE(CO2) - FETCH $0, 4 * SIZE(CO1) - FETCH $0, 8 * SIZE(CO2) - FETCH $0, 8 * SIZE(CO1) - FETCH $0, 12 * SIZE(CO2) - FETCH $0, 12 * SIZE(CO1) - FETCH $0, 16 * SIZE(CO2) - FETCH $0, 16 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE - + .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel @@ -480,22 +606,69 @@ blez I, .L19 daddu C, C, LDC # Change C to next panel - dsra L, K, 2 # Unroll K 4 times +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + move BO, B + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MTC $0, c11 # Clear results regs MOV c12, c11 - gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c13, c11 MOV c14, c11 - gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + 
daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + dsra L, K, 2 # Unroll K 4 times + move BO, B + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + MOV c13, c11 + MOV c14, c11 + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) @@ -504,6 +677,7 @@ MOV c33, c11 blez L, .L35 MOV c34, c11 +#endif .align 5 @@ -582,15 +756,18 @@ .L35: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) - NOP +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: - daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd @@ -615,6 +792,7 @@ gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -645,10 +823,60 @@ daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 5 .L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + bgtz J, .L10 move B, BO @@ -662,11 +890,56 @@ dsra I, M, 1 # I=M/2 move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, A .L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else dsra L, K, 2 # Unroll K 4 times move BO, B @@ -691,8 +964,9 @@ blez L, .L25 NOP +#endif - .align 3 + .align 5 .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 @@ -766,15 +1040,18 @@ .L25: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) - +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: - daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd @@ -799,6 +1076,7 @@ FETCH $0, 0 * SIZE(PREA) .L28: +#ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 @@ -824,6 +1102,48 @@ ST 
b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -833,6 +1153,39 @@ blez I, .L999 NOP +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + MTC $0, c11 # Clear results regs + MOV c12, c11 + + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else dsra L, K, 2 # Unroll K 4 times move BO, B @@ -848,6 +1201,7 @@ FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP +#endif .align 3 @@ -892,8 +1246,13 @@ .align 5 .L45: +#ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif blez L, .L48 LD ALPHA_I, 136($sp) @@ -914,6 +1273,7 @@ NOP .L48: +#ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 @@ -929,7 +1289,40 @@ ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + daddiu CO1,CO1, 2 * SIZE +#endif diff --git a/param.h b/param.h index b7f0d662a..cab3e68dd 100644 --- a/param.h +++ b/param.h @@ -1500,7 +1500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 #define CGEMM_DEFAULT_Q 144 -#define ZGEMM_DEFAULT_Q 60 +#define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 From 708d2b625504a67c5385efbff8010ea5c7a2b98e Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 24 Jun 2011 09:27:41 +0000 Subject: [PATCH 04/52] Fix compute error in ztrmm. 
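The wrong results came from the TRMM path of the M-remainder block: after
the #else branch had computed the KK-adjusted BO, a leftover `move BO, B`
(and a quad-load issued through the not-yet-final pointer) still executed
and reset BO to the start of the panel, so the tile read the wrong rows
of B. In C terms, roughly (illustrative only, nr = 2):

    bo = b + kk * nr * 2;   /* offset-adjusted start for the TRMM case */
    bo = b;                 /* stray leftover clobbered it: wrong data */

The fix drops the stray move, loads b1/b2 only once BO is final, and
re-interleaves the register clears to match.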
---
 kernel/mips64/zgemm_kernel_loongson3a.S | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a.S
index 13022f698..4cc396614 100644
--- a/kernel/mips64/zgemm_kernel_loongson3a.S
+++ b/kernel/mips64/zgemm_kernel_loongson3a.S
@@ -618,28 +618,26 @@ #endif

 	gsLQC1(R12, F1, F0, 0)			# R:a1 I:a2
-	move	BO, B
-
-	gsLQC1(R13, F5, F4, 0)			# R:b1 I:b2
 	MTC	$0, c11				# Clear results regs
 	MOV	c12, c11

-	gsLQC1(R13, F7, F6, 1)			# R:b2 I:b3
+	gsLQC1(R13, F5, F4, 0)			# R:b1 I:b2
 	MOV	c13, c11
 	MOV	c14, c11

-	FETCH	$0, 0 * SIZE(PREB)
+	gsLQC1(R13, F7, F6, 1)			# R:b2 I:b3
 	MOV	c31, c11
 	MOV	c32, c11

+	FETCH	$0, 0 * SIZE(PREB)
+	MOV	c33, c11
+	MOV	c34, c11
+
 	FETCH	$0, 0 * SIZE(CO1)
 	FETCH	$0, 0 * SIZE(CO2)
 	FETCH	$0, 4 * SIZE(CO1)
 	FETCH	$0, 4 * SIZE(CO2)

-	MOV	c33, c11
-	MOV	c34, c11
-
 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
 	dsubu	TEMP, K, KK
 #elif defined(LEFT)
 	daddiu	TEMP, KK, 1
 #else
 	daddiu	TEMP, KK, 2
 #endif
 	dsra	L, TEMP, 2
 	blez	L, .L35
 	NOP

From c8360e3ae5793e9285c74f7e78d33c0b10653a91 Mon Sep 17 00:00:00 2001
From: traz
Date: Mon, 18 Jul 2011 17:03:38 +0000
Subject: [PATCH 05/52] Complete all the complex single precision functions of
 level3 on Loongson3a, the performance is 2.3GFlops.

---
 kernel/mips64/KERNEL                          |   11 +
 kernel/mips64/KERNEL.LOONGSON3A               |   17 +-
 kernel/mips64/cgemm_kernel_loongson3a_2x2.S   | 1468 +++++++++++++++++
 ...gson3a.S => dgemm_kernel_loongson3a_4x4.S} |    0
 ...gson3a.S => sgemm_kernel_loongson3a_4x4.S} |    0
 ...gson3a.S => zgemm_kernel_loongson3a_2x2.S} |    2 +-
 param.h                                       |   10 +-
 7 files changed, 1499 insertions(+), 9 deletions(-)
 create mode 100644 kernel/mips64/cgemm_kernel_loongson3a_2x2.S
 rename kernel/mips64/{gemm_kernel_loongson3a.S => dgemm_kernel_loongson3a_4x4.S} (100%)
 rename kernel/mips64/{sgemm_kernel_loongson3a.S => sgemm_kernel_loongson3a_4x4.S} (100%)
 rename kernel/mips64/{zgemm_kernel_loongson3a.S => zgemm_kernel_loongson3a_2x2.S} (100%)

diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL
index a14b1cb38..6afb2cf13 100644
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -123,10 +123,21 @@ ifndef DTRSMKERNEL_RT
 DTRSMKERNEL_RT = trsm_kernel_RT.S
 endif

+ifndef CTRSMKERNEL_LN
 CTRSMKERNEL_LN = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_LT
 CTRSMKERNEL_LT = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_RN
 CTRSMKERNEL_RN = ztrsm_kernel_LT.S
+endif
+
+ifndef CTRSMKERNEL_RT
 CTRSMKERNEL_RT = ztrsm_kernel_RT.S
+endif

 ifndef ZTRSMKERNEL_LN
 ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index 706f48128..ebab8e6ea 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -1,19 +1,25 @@
 SAXPYKERNEL=axpy_loongson3a.S
 DAXPYKERNEL=daxpy_loongson3a_simd.S

-SGEMMKERNEL = sgemm_kernel_loongson3a.S
+SGEMMKERNEL = sgemm_kernel_loongson3a_4x4.S
 SGEMMONCOPY = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ = sgemm_oncopy.o
 SGEMMOTCOPYOBJ = sgemm_otcopy.o

-DGEMMKERNEL = gemm_kernel_loongson3a.S
+DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
 DGEMMONCOPY = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o

-ZGEMMKERNEL = zgemm_kernel_loongson3a.S
+CGEMMKERNEL = cgemm_kernel_loongson3a_2x2.S
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
 ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY = 
../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o @@ -29,6 +35,11 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 
72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + 
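
For reference, the .L12 loop above keeps the four partial products of every complex multiply in separate accumulators (c11..c14 for each element of C) and only combines them, with the signs selected by the MADD1..MADD4 macros, at write-back in .L18. A minimal C model of one C element in the NN case (neither operand conjugated) — the names and layout here are illustrative, not the kernel's actual interface:

#include <stddef.h>

typedef double FLOAT;

/* Accumulate k terms of (a1 + a2*i) * (b1 + b2*i) in four separate
   partial sums, exactly as the kernel keeps c11..c14, then combine
   and scale by alpha on write-back. */
static void zdot_nn_1x1(const FLOAT *ao, const FLOAT *bo, size_t k,
                        FLOAT alpha_r, FLOAT alpha_i,
                        FLOAT *c)                 /* c[0]=Re, c[1]=Im */
{
    FLOAT c11 = 0.0, c12 = 0.0, c13 = 0.0, c14 = 0.0;
    for (size_t l = 0; l < k; l++) {
        FLOAT a1 = ao[2 * l], a2 = ao[2 * l + 1];   /* Re(A), Im(A) */
        FLOAT b1 = bo[2 * l], b2 = bo[2 * l + 1];   /* Re(B), Im(B) */
        c11 += a1 * b1;     /* MADD1: a*c               */
        c12 += a2 * b1;     /* MADD2: b*c               */
        c13 += a1 * b2;     /* MADD3: a*d               */
        c14 -= a2 * b2;     /* MADD4 = NMSUB here: -b*d */
    }
    FLOAT re = c11 + c14;   /* ADD c11, c14, c11 -> ac - bd */
    FLOAT im = c12 + c13;   /* ADD c12, c13, c12 -> bc + ad */
    c[0] += alpha_r * re - alpha_i * im;   /* MADD then NMSUB in .L18 */
    c[1] += alpha_r * im + alpha_i * re;   /* MADD then MADD          */
}

The other conjugation variants (NR, RN, ..., CC) only flip which of the four accumulations use NMSUB instead of MADD; the loop structure is identical.
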
+ LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, 
c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # 
bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + 
dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + 
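
The TRMM path above repositions AO/BO and recomputes the trip count from KK before every micro-tile, so the kernel only walks the part of each panel that can be non-zero. A compact C model of that bookkeeping for an MR x NR tile (MR=2, NR=1 in this .L21 path); the names are ours, and the factor 2 below is the real/imag pair of one complex element:

typedef double FLOAT;

static void trmm_tile_setup(int left, int transa,
                            long k, long kk, long mr, long nr,
                            const FLOAT *a_panel, const FLOAT *b_panel,
                            const FLOAT **ao, const FLOAT **bo,
                            long *trip)
{
    if ((left && transa) || (!left && !transa)) {
        *ao = a_panel;                    /* panels start at the top    */
        *bo = b_panel;                    /* move BO, B                 */
    } else {
        *ao = a_panel + kk * mr * 2;      /* dsll L, KK, ... ; daddu AO */
        *bo = b_panel + kk * nr * 2;      /* daddu BO, B, TEMP          */
    }
    if ((left && !transa) || (!left && transa))
        *trip = k - kk;                   /* dsubu  TEMP, K, KK */
    else if (left)
        *trip = kk + mr;                  /* daddiu TEMP, KK, 2 */
    else
        *trip = kk + nr;                  /* daddiu TEMP, KK, 1 */
}

After the tile is written back, the same quantities advance AO/BO past the unused tail of the panels, and KK advances by MR after each tile row (LEFT) or by NR after each column panel (!LEFT) so the next tile starts on the diagonal.
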
MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * 
SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/gemm_kernel_loongson3a.S b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S similarity index 100% rename from kernel/mips64/gemm_kernel_loongson3a.S rename to kernel/mips64/dgemm_kernel_loongson3a_4x4.S diff --git a/kernel/mips64/sgemm_kernel_loongson3a.S b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S similarity index 100% rename from kernel/mips64/sgemm_kernel_loongson3a.S 
rename to kernel/mips64/sgemm_kernel_loongson3a_4x4.S diff --git a/kernel/mips64/zgemm_kernel_loongson3a.S b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S similarity index 100% rename from kernel/mips64/zgemm_kernel_loongson3a.S rename to kernel/mips64/zgemm_kernel_loongson3a_2x2.S index 4cc396614..a8faad2f6 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a.S +++ b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S @@ -1065,8 +1065,8 @@ daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 diff --git a/param.h b/param.h index cab3e68dd..fd399a96f 100644 --- a/param.h +++ b/param.h @@ -1486,25 +1486,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 32 -#define CGEMM_DEFAULT_P 108 +#define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 116 #define DGEMM_DEFAULT_Q 116 -#define CGEMM_DEFAULT_Q 144 +#define CGEMM_DEFAULT_Q 100 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1000 #define DGEMM_DEFAULT_R 1000 -#define CGEMM_DEFAULT_R 2000 +#define CGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 #define SYMV_P 16 From 2e8cdd15423a98d5f8b8efd5e4dd66ff9364d343 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 30 Aug 2011 20:54:19 +0000 Subject: [PATCH 06/52] Using ps instruction. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 632 ++++++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100644 kernel/mips64/sgemm_kernel_8x4_ps.S diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S new file mode 100644 index 000000000..075957038 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -0,0 +1,632 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define FETCH ld +#define STACKSIZE 192 + +##### Parameter registers #### + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 $f23 +#define C13 $f24 +#define C14 $f25 +#define C23 $f26 +#define C24 $f27 +#define C33 $f28 +#define C34 $f29 +#define C43 $f30 +#define C44 $f31 + +#define I $2 +#define J $3 +#define L $7 + +#### Alpha register #### +#define ALPHA $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 
23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + +#define R12 12 +#define R13 13 + +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + + #.text +#.align 2 +# .globl REALNAME +# .set nomips16 +# .ent REALNAME +# .type REALNAME, @function +#REALNAME: +# .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 +# .mask 0x40000000,-8 +# .fmask 0x00000000,0 +# .set noreorder +# .set nomacro + + + PROLOGUE + + daddiu $sp,$sp,-STACKSIZE + sd $fp,184($sp) + move $fp,$sp + + sd $16, 0($fp) + sd $17, 8($fp) + sd $18, 16($fp) + sd $19, 24($fp) + sd $20, 32($fp) + sd $21, 40($fp) + sd $22, 48($fp) + + ST $f24, 56($fp) + ST $f25, 64($fp) + ST $f26, 72($fp) + ST $f27, 80($fp) + ST $f28, 88($fp) + +#if defined(TRMMKERNEL) + sd $23, 96($fp) + sd $24, 104($fp) + sd $25, 112($fp) +#endif + +#ifndef __64BIT__ + ST $f20,120($fp) + ST $f21,128($fp) + ST $f22,136($fp) + ST $f23,144($fp) +#endif + + .align 4 +.L4: + dsra J, N, 2 # NR=4 + dsll LDC, LDC, BASE_SHIFT# LDC*SIZE + + ST ALPHA, 152($fp) # Store alpha + blez J, .L2 + NOP + + +.L48: + dsra I, M, 3 # MR=8 + dsll PREA, K, BASE_SHIFT + + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + daddu CO3, CO2, LDC + + daddu CO4, CO3, LDC + daddu PREA, A, PREA + + blez I, .L44 + daddu C, CO4, LDC + + .align 4 +.L488: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=8 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + FETCH $0, 4 * SIZE(CO3) + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L484 + FETCH $0, 0 * SIZE(CO4) + +.L4880: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + FETCH $0, 0 * SIZE(PREA) + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + FETCH $0, 0 * SIZE(PREB) + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + FETCH $0, 4 * SIZE(PREA) + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + FETCH $0, 4 * SIZE(PREB) + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + FETCH $0, 8 * SIZE(PREA) + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + FETCH $0, 12 * SIZE(PREA) + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, 
A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + FETCH $0, 16 * SIZE(PREA) + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + FETCH $0, 20 * SIZE(PREA) + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + FETCH $0, 8 * SIZE(PREB) + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + FETCH $0, 12 * SIZE(PREB) + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + FETCH $0, 24 * SIZE(PREA) + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + FETCH $0, 28 * SIZE(PREA) + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + daddiu PREB, PREB, 16 * SIZE + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + bgtz L, .L4880 + MADPS C44, C44, A8, B8 + + .align 4 +.L484: + andi L, K, 4 + blez L, .L482 + NOP + + .align 4 +.L482: + andi L, K, 2 + blez L, .L481 + NOP + + .align 4 +.L481: + andi L, K, 1 + blez L, .L480 + NOP + + .align 4 +.L480: # Write Back + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + CVTU A3, C23 # A3=C23.upper=c14 + LD B1, 1 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B2, 1 * SIZE(CO2) + + CVTU A5, C33 # A5=C33.upper=c16 + LD B3, 3 * SIZE(CO1) + + CVTU A6, C31 # A6=C31.upper=c26 + LD B4, 3 * SIZE(CO2) + + CVTU A7, C43 # A7=C43.upper=c18 + LD B5, 5 * SIZE(CO1) + + CVTU A8, C41 # A8=C41.upper=c28 + LD B6, 5 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B7, 7 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B1, 7 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B2, 0 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B3, 0 * SIZE(CO2) + + MADD A5, B5, A5, ALPHA # c16 + LD B4, 2 * SIZE(CO1) + + MADD A6, B6, A6, ALPHA # c26 + LD B5, 2 * SIZE(CO2) + + MADD A7, B7, A7, ALPHA # c18 + LD B6, 4 * SIZE(CO1) + ST A1, 1 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 + LD B7, 4 * SIZE(CO2) + ST A2, 1 * SIZE(CO2) + + MADD C11, B2, C11, ALPHA # c12 + LD A1, 6 * SIZE(CO1) + ST A3, 3 * SIZE(CO1) + + MADD C13, B3, C13, ALPHA # c22 + LD A2, 6 * SIZE(CO2) + ST A4, 3 * SIZE(CO2) + + MADD C21, B4, C21, ALPHA # c14 + ST A5, 5 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 + ST A6, 5 * SIZE(CO2) + + MADD C31, B6, C31, ALPHA # c16 + ST A7, 7 * SIZE(CO1) + + MADD C33, B7, C33, ALPHA # c26 + ST A8, 7 * SIZE(CO2) + + MADD C41, A1, C41, ALPHA # c18 + ST C11, 0 * SIZE(CO1) + + MADD C43, A2, C43, ALPHA # c28 + ST C13, 0 * SIZE(CO2) + + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + ST C31, 4 * SIZE(CO1) + ST C33, 4 * SIZE(CO2) + ST C41, 6 * SIZE(CO1) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C43, 6 * SIZE(CO2) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B1, 1 * SIZE(CO3) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B2, 1 * SIZE(CO4) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B3, 3 * SIZE(CO3) + + CVTU A5, C34 # B5=C32.upper=c46 + LD 
B4, 3 * SIZE(CO4) + + CVTU A6, C32 # B6=C24.upper=c36 + LD B5, 5 * SIZE(CO3) + + CVTU A7, C44 # B7=C42.upper=c48 + LD B6, 5 * SIZE(CO4) + + CVTU A8, C42 # A1=C44.upper=c38 + LD B7, 7 * SIZE(CO3) + + MADD A1, B1, A1, ALPHA # c31 + LD C11, 7 * SIZE(CO4) + + MADD A2, B2, A2, ALPHA + LD C13, 0 * SIZE(CO3) + + MADD A3, B3, A3, ALPHA + LD C21, 0 * SIZE(CO4) + + MADD A4, B4, A4, ALPHA + LD C23, 2 * SIZE(CO3) + + MADD A5, B5, A5, ALPHA + LD C31, 2 * SIZE(CO4) + + MADD A6, B6, A6, ALPHA + LD C33, 4 * SIZE(CO3) + + MADD A7, B7, A7, ALPHA + LD C41, 4 * SIZE(CO4) + ST A1, 1 * SIZE(CO3) + + MADD A8, C11, A8, ALPHA + LD C43, 6 * SIZE(CO3) + ST A2, 1 * SIZE(CO4) + + MADD C12, C13, C12, ALPHA + LD B1, 6 * SIZE(CO4) + ST A3, 3 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA + ST A4, 3 * SIZE(CO4) + + MADD C22, C23, C22, ALPHA + ST A5, 5 * SIZE(CO3) + + MADD C24, C31, C24, ALPHA + ST A6, 5 * SIZE(CO4) + + MADD C32, C33, C32, ALPHA + ST A7, 7 * SIZE(CO3) + + MADD C34, C41, C34, ALPHA + ST A8, 7 * SIZE(CO4) + + MADD C42, C43, C42, ALPHA + ST C12, 0 * SIZE(CO3) + + MADD C44, B1, C44, ALPHA + ST C14, 0 * SIZE(CO4) + + ST C22, 2 * SIZE(CO3) + daddiu CO1, CO1, 8 * SIZE + + ST C24, 2 * SIZE(CO4) + daddiu CO2, CO2, 8 * SIZE + + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO3, CO3, 8 * SIZE + bgtz I, .L488 + daddiu CO4, CO4, 8 * SIZE + +.L44: + +.L40: + daddiu J, J, -1 + move B, BO + + bgtz J, .L48 + NOP + + .align 4 +.L2: # Nr=2 + andi J, N, 2 + blez J, .L1 + NOP + + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + + + +.L999: + ld $16, 0($fp) + ld $17, 8($fp) + ld $18, 16($fp) + ld $19, 24($fp) + ld $20, 32($fp) + ld $21, 40($fp) + ld $22, 48($fp) + + LD $f24, 56($fp) + LD $f25, 64($fp) + LD $f26, 72($fp) + LD $f27, 80($fp) + LD $f28, 88($fp) + +#if defined(TRMMKERNEL) + ld $23, 96($fp) + ld $24, 104($fp) + ld $25, 112($fp) +#endif + +#ifndef __64BIT__ + LD $f20,120($fp) + LD $f21,128($fp) + LD $f22,136($fp) + LD $f23,144($fp) +#endif + + move $sp,$fp + ld $fp,184($sp) + daddiu $sp,$sp,STACKSIZE + j $31 + nop + + EPILOGUE +# .set macro +# .set reorder +# .end REALNAME +# .size REALNAME, .-REALNAME +#.ident "GCC: (Debian 4.4.6-6) 4.4.6" From cb0214787b361a6e1f8ac0d1a423d4a95b474832 Mon Sep 17 00:00:00 2001 From: traz Date: Tue, 30 Aug 2011 20:57:00 +0000 Subject: [PATCH 07/52] Modify compile options. 
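
Besides wiring up the new 8x4 paired-single sgemm kernel (with its own 8-wide packing routines and the PLU/MADPS/CVTU macro names in common_mips64.h), this patch grows the packing buffer and retunes the cache-blocking constants in param.h. Roughly, the level-3 driver consumes those constants as sketched below — simplified, column-major, no packing or threading, with a naive triple loop standing in for the assembly micro-kernel:

#define BLK_P  64    /* SGEMM_DEFAULT_P: rows of the packed A panel    */
#define BLK_Q  128   /* SGEMM_DEFAULT_Q: depth (K) of both panels      */
#define BLK_R  1024  /* SGEMM_DEFAULT_R: columns of the packed B panel */

void sgemm_blocked(long m, long n, long k, float alpha,
                   const float *a, long lda,
                   const float *b, long ldb,
                   float *c, long ldc)
{
    for (long js = 0; js < n; js += BLK_R)           /* R-wide column panels */
        for (long ls = 0; ls < k; ls += BLK_Q)       /* Q-deep K slices      */
            for (long is = 0; is < m; is += BLK_P) { /* P-tall row panels    */
                long je = js + BLK_R < n ? js + BLK_R : n;
                long le = ls + BLK_Q < k ? ls + BLK_Q : k;
                long ie = is + BLK_P < m ? is + BLK_P : m;
                /* The real driver packs the P x Q block of A and the
                   Q x R block of B (GEMM_OFFSET_A1/B1 size the buffers)
                   and hands them to the assembly kernel, which sweeps
                   them in UNROLL_M x UNROLL_N (8x4) micro-tiles. */
                for (long j = js; j < je; j++)
                    for (long l = ls; l < le; l++)
                        for (long i = is; i < ie; i++)
                            c[i + j * ldc] += alpha * a[i + l * lda]
                                                    * b[l + j * ldb];
            }
}
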
--- common_mips64.h | 9 ++++++++- kernel/mips64/KERNEL.LOONGSON3A | 10 +++++++--- param.h | 21 +++++++++++++++++---- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/common_mips64.h b/common_mips64.h index acea79011..2aa325bfa 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -170,6 +170,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s +#define PLU plu.ps +#define PLL pll.ps +#define PUU puu.ps +#define PUL pul.ps +#define MADPS madd.ps +#define CVTU cvt.s.pu +#define CVTL cvt.s.pl #endif #if defined(__64BIT__) && defined(USE64BITINT) @@ -218,7 +225,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 8 << 20) +#define BUFFER_SIZE ( 32 << 20) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index ebab8e6ea..4a195f265 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,9 +1,13 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S -SGEMMKERNEL = sgemm_kernel_loongson3a_4x4.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/param.h b/param.h index fd399a96f..2c3021710 100644 --- a/param.h +++ b/param.h @@ -1480,7 +1480,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 @@ -1497,16 +1497,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 -#define SGEMM_DEFAULT_Q 116 -#define DGEMM_DEFAULT_Q 116 +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 112 #define CGEMM_DEFAULT_Q 100 #define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 1000 +#define SGEMM_DEFAULT_R 1024 +//#define DGEMM_DEFAULT_R 300 +//#define DGEMM_DEFAULT_R 200 +//#define DGEMM_DEFAULT_R 400 +//#define DGEMM_DEFAULT_R 192 #define DGEMM_DEFAULT_R 1000 +//#define DGEMM_DEFAULT_R 160 +//#define DGEMM_DEFAULT_R 270 #define CGEMM_DEFAULT_R 1000 +//#define ZGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 +#define GEMM_OFFSET_A1 (DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*SIZE) +//#define GEMM_OFFSET_B1 0x10 +#define GEMM_OFFSET_B1 (DGEMM_DEFAULT_Q*DGEMM_DEFAULT_R*SIZE) +#define GEMM_OFFSET 0x100000 +#define GEMM_OFFSET1 0x40000 + #define SYMV_P 16 #endif From 09f49fa891a7351abdcf6db95a45c6d6780b69e0 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 31 Aug 2011 21:24:03 +0000 Subject: [PATCH 08/52] Using PS instructions to improve the performance of sgemm and it is 4.2Gflops now. 
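
The kernel gets its throughput from the Loongson 3A's paired-single FPU: every FP register holds two floats, madd.ps is an element-wise fused multiply-add on both lanes, and plu.ps applied to a register and itself swaps its two halves. One A pair (two rows) can therefore be multiplied against a B pair (two columns) and its swapped copy to produce all four C elements of a 2x2 sub-tile in two MADPS ops. A C model of one K step — our reading of the kernel, with illustrative lane naming:

typedef struct { float lo, hi; } ps_t;      /* one paired-single register  */

static ps_t madps(ps_t c, ps_t a, ps_t b)   /* madd.ps: per-lane FMA       */
{
    c.lo += a.lo * b.lo;
    c.hi += a.hi * b.hi;
    return c;
}

static ps_t plu_swap(ps_t x)                /* PLU B3, B1, B1: swap halves */
{
    ps_t r = { x.hi, x.lo };
    return r;
}

/* One K step of a 2x2 sub-tile: a = (a[i], a[i+1]), b = (b[j], b[j+1]).
   c_diag accumulates c(i,j) and c(i+1,j+1); c_cross the other two. */
static void kstep_2x2(ps_t a, ps_t b, ps_t *c_diag, ps_t *c_cross)
{
    ps_t bswap = plu_swap(b);
    *c_diag  = madps(*c_diag,  a, b);       /* MADPS C11, C11, A1, B1 */
    *c_cross = madps(*c_cross, a, bswap);   /* MADPS C13, C13, A1, B3 */
}

At write-back the two lanes are un-interleaved with cvt.s.pu / cvt.s.pl (the CVTU lines in .L480), which is why even and odd elements of each C row are stored through different registers.
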
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 6041 ++++++++++++++++++++++++++- 1 file changed, 5951 insertions(+), 90 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 075957038..02a0833dd 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -2,13 +2,12 @@ #define ASSEMBLER #include "common.h" +#define FETCH ld +#define STACKSIZE 192 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define FETCH ld -#define STACKSIZE 192 - ##### Parameter registers #### #define M $4 @@ -115,13 +114,13 @@ #define R16 16 #define R17 17 - #.text -#.align 2 -# .globl REALNAME +# .text +# .align 2 +## .globl gemm # .set nomips16 -# .ent REALNAME -# .type REALNAME, @function -#REALNAME: +# .ent gemm +# .type gemm, @function +#gemm: # .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 @@ -166,11 +165,8 @@ .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE - - ST ALPHA, 152($fp) # Store alpha blez J, .L2 - NOP - + ST ALPHA, 152($fp) .L48: dsra I, M, 3 # MR=8 @@ -189,9 +185,9 @@ daddu C, CO4, LDC .align 4 -.L488: +.L481: move BO, B # Reset B - dsra L, K, 2 # UnRoll K=8 + dsra L, K, 6 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 @@ -233,10 +229,10 @@ FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 - blez L, .L484 - FETCH $0, 0 * SIZE(CO4) + blez L, .L482 + FETCH $0, 4 * SIZE(CO4) -.L4880: +.L4810: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -252,21 +248,21 @@ MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 - FETCH $0, 0 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 0 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) - FETCH $0, 4 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -285,21 +281,21 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 4 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - FETCH $0, 8 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 12 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 @@ -316,25 +312,25 @@ MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 - FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR - FETCH $0, 20 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 8 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -353,45 +349,4246 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 12 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE - FETCH $0, 24 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 28 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 - daddiu PREB, PREB, 16 * SIZE + FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) daddiu 
PREA, PREA, 32 * SIZE - + MADPS C34, C34, A7, B8 - bgtz L, .L4880 MADPS C44, C44, A8, B8 - .align 4 -.L484: - andi L, K, 4 - blez L, .L482 - NOP - + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, 
C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS 
C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + 
FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, 
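+	# Delay slot: the last MADPS of the pass issues together with the
+	# loop-back branch (bgtz) below, so no cycle is wasted on a NOP here.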
+	MADPS	C34, C34, A7, B8
+	bgtz	L, .L4810
+	MADPS	C44, C44, A8, B8
+
 	.align	4
.L482:
-	andi	L, K, 2
-	blez	L, .L481
+	andi	L, K, 32
+	blez	L, .L483
 	NOP
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 1)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 2)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 3)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 0 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	MADPS	C23, C23, A2, B3
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 0 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 4 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 2)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 4)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 5)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 4 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 8 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 12 * SIZE(PREA)
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
+
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 3)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 6)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 7)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 8 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	daddiu	BO, BO, 16 * SIZE	# 4KR*4NR
+	MADPS	C23, C23, A2, B3
+	daddiu	AO, AO, 32 * SIZE	# 4KR*8MR
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 16 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 20 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 0)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 0)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 1)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 12 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+	daddiu	PREB, PREB, 16 * SIZE
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 24 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 28 * SIZE(PREA)
+	daddiu	PREA, PREA, 32 * SIZE
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
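+	# The leftover K iterations are peeled in halving chunks: .L482 above
+	# covers (K & 32), then .L483 handles (K & 16), .L484 (K & 8),
+	# .L485 (K & 4) and .L486 (K & 2), each repeating the same 4x4
+	# accumulator block fewer times before the final write-back.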
+
+	.align	4
-.L481:
+.L483:
+	andi	L, K, 16
+	blez	L, .L484
+	NOP
+
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 1)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 2)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 3)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 0 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	MADPS	C23, C23, A2, B3
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 0 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 4 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 2)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 4)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 5)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 4 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 8 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 12 * SIZE(PREA)
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
+
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 3)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 6)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 7)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 8 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	daddiu	BO, BO, 16 * SIZE	# 4KR*4NR
+	MADPS	C23, C23, A2, B3
+	daddiu	AO, AO, 32 * SIZE	# 4KR*8MR
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 16 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 20 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 0)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 0)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 1)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 12 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+	daddiu	PREB, PREB, 16 * SIZE
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 24 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 28 * SIZE(PREA)
+	daddiu	PREA, PREA, 32 * SIZE
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
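+	# As elsewhere in this series, FETCH is a plain ld into $0, i.e. a
+	# software prefetch: PREA and PREB are kept 32*SIZE resp. 16*SIZE
+	# ahead of the A and B panels per unrolled block.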
+
+
+	.align	4
+.L484:
+	andi	L, K, 8
+	blez	L, .L485
+	NOP
+
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 1)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 2)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 3)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 0 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	MADPS	C23, C23, A2, B3
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 0 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 4 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 2)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 4)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 5)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 4 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 8 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 12 * SIZE(PREA)
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
+
+	MADPS	C11, C11, A1, B1
+	MADPS	C21, C21, A2, B1
+	gsLQC1(R13, F13, F12, 3)	# B3 B4
+
+	MADPS	C12, C12, A1, B2
+	MADPS	C22, C22, A2, B2
+	gsLQC1(R12, F5, F4, 6)	# A5 A6
+
+	MADPS	C31, C31, A3, B1
+	MADPS	C41, C41, A4, B1
+	gsLQC1(R12, F7, F6, 7)	# A7 A8
+
+	MADPS	C32, C32, A3, B2
+	MADPS	C42, C42, A4, B2
+	FETCH	$0, 8 * SIZE(PREB)
+
+	MADPS	C13, C13, A1, B3
+	daddiu	BO, BO, 16 * SIZE	# 4KR*4NR
+	MADPS	C23, C23, A2, B3
+	daddiu	AO, AO, 32 * SIZE	# 4KR*8MR
+
+	MADPS	C33, C33, A3, B3
+	MADPS	C43, C43, A4, B3
+
+	MADPS	C14, C14, A1, B4
+	PLU	B7, B5, B5
+	FETCH	$0, 16 * SIZE(PREA)
+
+	MADPS	C24, C24, A2, B4
+	PLU	B8, B6, B6
+	FETCH	$0, 20 * SIZE(PREA)
+
+	MADPS	C34, C34, A3, B4
+	MADPS	C44, C44, A4, B4
+
+	MADPS	C11, C11, A5, B5
+	MADPS	C21, C21, A6, B5
+	gsLQC1(R13, F9, F8, 0)	# B1 B2
+
+	MADPS	C12, C12, A5, B6
+	MADPS	C22, C22, A6, B6
+	gsLQC1(R12, F1, F0, 0)	# A1 A2
+
+	MADPS	C31, C31, A7, B5
+	MADPS	C41, C41, A8, B5
+	gsLQC1(R12, F3, F2, 1)	# A3 A4
+
+	MADPS	C32, C32, A7, B6
+	MADPS	C42, C42, A8, B6
+	FETCH	$0, 12 * SIZE(PREB)
+
+	MADPS	C13, C13, A5, B7
+	MADPS	C23, C23, A6, B7
+	daddiu	PREB, PREB, 16 * SIZE
+
+	MADPS	C33, C33, A7, B7
+	MADPS	C43, C43, A8, B7
+
+	MADPS	C14, C14, A5, B8
+	PLU	B3, B1, B1
+	FETCH	$0, 24 * SIZE(PREA)
+
+	MADPS	C24, C24, A6, B8
+	PLU	B4, B2, B2
+	FETCH	$0, 28 * SIZE(PREA)
+	daddiu	PREA, PREA, 32 * SIZE
+
+	MADPS	C34, C34, A7, B8
+	MADPS	C44, C44, A8, B8
gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L485: + andi L, K, 4 + blez L, .L486 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + 
MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L486: + andi L, K, 2 + blez L, .L487 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 8 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 16 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS 
C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 8 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + daddiu PREA, PREA, 16 * SIZE + + + .align 4 +.L487: andi L, K, 1 blez L, .L480 - NOP + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 8 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + .align 4 .L480: # Write Back @@ -417,56 +4614,57 @@ CVTU A8, C41 # A8=C41.upper=c28 LD B6, 5 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B7, 7 * SIZE(CO1) - MADD A2, B2, A2, ALPHA # c22 + MADD A2, B2, A2, ALPHA # c22 LD B1, 7 * SIZE(CO2) - MADD A3, B3, A3, ALPHA # c14 + MADD A3, B3, A3, ALPHA # c14 LD B2, 0 * SIZE(CO1) - MADD A4, B4, A4, ALPHA # c24 + MADD A4, B4, A4, ALPHA # c24 LD B3, 0 * SIZE(CO2) - MADD A5, B5, A5, ALPHA # c16 + MADD A5, B5, A5, ALPHA # c16 LD B4, 2 * SIZE(CO1) - MADD A6, B6, A6, ALPHA # c26 + MADD A6, B6, A6, ALPHA # c26 LD B5, 2 * SIZE(CO2) - MADD A7, B7, A7, ALPHA # c18 + MADD A7, B7, A7, ALPHA # c18 LD B6, 4 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 ST A1, 1 * SIZE(CO1) - MADD A8, B1, A8, ALPHA # c28 + MADD C11, B2, C11, ALPHA # c12 LD B7, 4 * SIZE(CO2) + + MADD C13, B3, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) - MADD C11, B2, C11, ALPHA # c12 + MADD C21, B4, C21, ALPHA # c14 LD A1, 6 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 ST A3, 3 * SIZE(CO1) - MADD C13, B3, C13, ALPHA # c22 + MADD C31, B6, C31, ALPHA # c16 LD A2, 6 * SIZE(CO2) + + MADD C33, B7, C33, ALPHA # c26 ST A4, 3 * SIZE(CO2) - MADD C21, B4, C21, ALPHA # c14 ST A5, 5 * SIZE(CO1) - - MADD C23, B5, C23, ALPHA # c24 ST A6, 5 * SIZE(CO2) - - MADD C31, B6, C31, ALPHA # c16 ST A7, 7 * SIZE(CO1) - - MADD C33, B7, C33, ALPHA # c26 ST A8, 7 * SIZE(CO2) - MADD C41, A1, C41, ALPHA # c18 + MADD C41, A1, C41, ALPHA # c18 ST C11, 0 * SIZE(CO1) - MADD C43, A2, C43, ALPHA # c28 + MADD C43, A2, C43, ALPHA # c28 ST C13, 0 * SIZE(CO2) ST C21, 2 * SIZE(CO1) @@ -499,87 +4697,1327 @@ CVTU A8, C42 # A1=C44.upper=c38 LD B7, 7 * SIZE(CO3) - MADD A1, B1, A1, ALPHA # c31 + MADD A1, B1, A1, ALPHA # c31 LD C11, 7 * SIZE(CO4) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD C13, 0 * SIZE(CO3) - MADD A3, B3, A3, ALPHA + MADD A3, B3, A3, ALPHA LD C21, 0 * SIZE(CO4) - MADD A4, B4, A4, ALPHA + MADD A4, B4, A4, ALPHA LD C23, 2 * SIZE(CO3) - MADD A5, B5, A5, ALPHA + MADD A5, B5, A5, ALPHA LD C31, 2 * SIZE(CO4) - MADD A6, B6, A6, ALPHA + MADD A6, B6, A6, ALPHA LD C33, 4 * SIZE(CO3) - MADD A7, B7, A7, ALPHA + MADD A7, B7, A7, ALPHA LD C41, 4 * SIZE(CO4) + + MADD A8, C11, A8, ALPHA ST A1, 1 * SIZE(CO3) - MADD A8, C11, A8, ALPHA + MADD C12, C13, C12, ALPHA LD C43, 6 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA ST A2, 1 * SIZE(CO4) - MADD C12, C13, C12, ALPHA + MADD C22, C23, C22, ALPHA LD B1, 6 * SIZE(CO4) + + MADD C24, C31, C24, ALPHA ST A3, 3 * SIZE(CO3) - MADD C14, C21, C14, ALPHA + MADD C32, C33, C32, ALPHA ST A4, 3 * SIZE(CO4) - MADD C22, C23, 
C22, ALPHA + MADD C34, C41, C34, ALPHA ST A5, 5 * SIZE(CO3) - MADD C24, C31, C24, ALPHA + MADD C42, C43, C42, ALPHA ST A6, 5 * SIZE(CO4) - MADD C32, C33, C32, ALPHA ST A7, 7 * SIZE(CO3) + NOP - MADD C34, C41, C34, ALPHA + MADD C44, B1, C44, ALPHA ST A8, 7 * SIZE(CO4) - MADD C42, C43, C42, ALPHA ST C12, 0 * SIZE(CO3) - - MADD C44, B1, C44, ALPHA ST C14, 0 * SIZE(CO4) - ST C22, 2 * SIZE(CO3) - daddiu CO1, CO1, 8 * SIZE - ST C24, 2 * SIZE(CO4) - daddiu CO2, CO2, 8 * SIZE - ST C32, 4 * SIZE(CO3) ST C34, 4 * SIZE(CO4) ST C42, 6 * SIZE(CO3) ST C44, 6 * SIZE(CO4) + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE daddiu CO3, CO3, 8 * SIZE - bgtz I, .L488 + bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE -.L44: + .align 4 +.L44: + andi I, M, 4 # MR=4 + blez I, .L42 + NOP + + .align 4 +.L441: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L442 + NOP + +.L4410: # + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A5, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C21, C21, A6, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C12, C12, A5, B2 + FETCH $0, 8 * SIZE(PREB) + daddiu BO, BO, 16 * SIZE # 4KR*4NR + + MADPS C22, C22, A6, B2 + FETCH $0, 8 * SIZE(PREA) + daddiu AO, AO, 16 * SIZE # 4KR*4MR + + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A7, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A8, B5 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C12, C12, A7, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C22, C22, A8, B6 + FETCH $0, 12 * SIZE(PREA) + + MADPS C13, C13, A7, B7 + daddiu PREA, PREA, 16 * SIZE + MADPS C23, C23, A8, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L4410 + PLU B4, B2, B2 + + .align 4 +.L442: + andi L, K, 2 + blez L, .L443 + NOP + + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + daddiu BO, BO, 8 * SIZE # 2KR*4NR + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + daddiu AO, AO, 8 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU 
B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F1, F0, 0) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + daddiu PREB, PREB, 8 + MADPS C23, C23, A4, B7 + daddiu PREA, PREA, 8 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L443: + andi L, K, 1 + blez L, .L440 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 1KR*4NR + MADPS C22, C22, A2, B2 + daddiu AO, AO, 4 * SIZE # 1KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + + .align 4 +.L440: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + CVTU A3, C23 # A3=C23.upper=c14 + LD B3, 3 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B4, 3 * SIZE(CO2) + + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B7, 2 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B1, 2 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MADD C21, B7, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MADD C23, B1, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B3, 3 * SIZE(CO3) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B4, 3 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD A3, B3, A3, ALPHA + LD A7, 2 * SIZE(CO3) + + MADD A4, B4, A4, ALPHA + LD A8, 2 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MADD C22, A7, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MADD C24, A8, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + + .align 4 +.L42: + andi I, M, 2 + blez I, .L41 + NOP + + .align 4 +.L421: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L422 + NOP + +.L4210: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + gsLQC1(R12, F3, F2, 1) # B1 B2 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 8 * SIZE # 4KR*2MR + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS 
C11, C11, A3, B1 + gsLQC1(R12, F1, F0, 0) # B3 B4 + + MADPS C12, C12, A3, B2 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + gsLQC1(R13, F9, F8, 0) # B3 B4 + + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L4210 + PLU B4, B2, B2 + + .align 4 +.L422: + andi L, K, 2 + blez L, .L423 + NOP + + daddiu AO, AO, 4 * SIZE # 2KR*2MR + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + daddiu BO, BO, 8 * SIZE # 2KR*2MR + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + gsLQC1(R12, F1, F0, 0) + + PLU B3, B1, B1 + PLU B4, B2, B2 + +.L423: + andi L, K, 1 + blez L, .L420 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 2KR*4NR + daddiu AO, AO, 2 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L420: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + .align 4 +.L41: + andi I, M, 1 + blez I, .L40 + NOP + + .align 4 +.L411: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L412 + MOV C44, C11 + +.L4110: + daddiu L, L, -1 + LD A2, 1 * SIZE(AO) + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A3, 2 * SIZE(AO) + NOP + + MADD C11, C11, A2, B5 + LD B1, 8 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 9 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 10 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 11 * SIZE(BO) + + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B1 + LD B5, 12 * SIZE(BO) + + MADD C12, C12, A3, B2 + LD B6, 13 * SIZE(BO) + + MADD C13, C13, A3, B3 + LD B7, 14 * SIZE(BO) + + MADD C14, C14, A3, B4 + LD B8, 15 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD C11, C11, A4, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A4, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A4, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A4, B8 + bgtz 
L, .L4110 + LD B4, 3 * SIZE(BO) + +.L412: + andi L, K, 2 + blez L, .L413 + NOP + + LD A2, 1 * SIZE(AO) + daddiu AO, AO, 2 * SIZE + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 8 * SIZE + + MADD C11, C11, A2, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 3 * SIZE(BO) + +.L413: + andi L, K, 1 + blez L, .L410 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + MADD C13, C13, A1, B3 + MADD C14, C14, A1, B4 + daddiu BO, BO, 4 * SIZE + + .align 4 +.L410: + LD A5, 0 * SIZE(CO1) + LD A6, 0 * SIZE(CO2) + LD A7, 0 * SIZE(CO3) + LD A8, 0 * SIZE(CO4) + + MADD A5, A5, C11, ALPHA + MADD A6, A6, C12, ALPHA + MADD A7, A7, C13, ALPHA + MADD A8, A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + .align 4 .L40: daddiu J, J, -1 move B, BO - bgtz J, .L48 NOP + + .align 4 .L2: # Nr=2 andi J, N, 2 blez J, .L1 NOP +.L28: + dsra I, M, 3 # MR=8 + + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + blez I, .L24 + daddu C, CO2, LDC + + + .align 4 +.L281: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L282 + MOV C44, C11 + + + .align 4 +.L2810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + LD B3, 2 * SIZE(BO) + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + LD A2, 13 * SIZE(AO) + + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + LD A3, 14 * SIZE(AO) + + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B3 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B3 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B3 + MADD C23, C23, A2, B3 + LD A7, 6 * SIZE(AO) + + MADD C33, C33, A3, B3 + MADD C43, C43, A4, B3 + LD A8, 7 * SIZE(AO) + + MADD C14, C14, A1, B4 + MADD C24, C24, A2, B4 + LD B1, 0 * SIZE(BO) + + MADD C34, C34, A3, B4 + MADD C44, C44, A4, B4 + LD B2, 1 * SIZE(BO) + + MADD C31, C31, B7, B3 + MADD C41, C41, B8, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, B5, B4 + LD A2, 1 * SIZE(AO) + + MADD C22, C22, B6, B4 + LD A3, 2 * SIZE(AO) + + LD A4, 3 * SIZE(AO) + MADD C32, C32, B7, B4 + bgtz L, .L2810 + MADD C42, C42, B8, B4 + + .align 4 +.L282: + andi L, K, 1 + blez L, .L280 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, 
B1 + MADD C43, C43, A8, B1 + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L280: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + LD A5, 4 * SIZE(CO1) + LD A6, 5 * SIZE(CO1) + LD A7, 6 * SIZE(CO1) + LD A8, 7 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD A5, A5, C13, ALPHA + LD B5, 4 * SIZE(CO2) + + MADD A6, A6, C23, ALPHA + LD B6, 5 * SIZE(CO2) + + MADD A7, A7, C33, ALPHA + LD B7, 6 * SIZE(CO2) + + MADD A8, A8, C43, ALPHA + LD C11, 7 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MADD B5, B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MADD B6, B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MADD B7, B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MADD C11, C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE + + + .align 4 +.L24: + andi I, M, 4 # MR=4 + blez I, .L22 + NOP + + .align 4 +.L241: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L242 + MOV C44, C11 + + + .align 4 +.L2410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD B4, 3 * SIZE(BO) + + MADD C41, C41, A4, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 4 * SIZE + + MADD C12, C12, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C22, C22, A2, B2 + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD B2, 1 * SIZE(BO) + + MADD C41, C41, A8, B3 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A5, B4 + LD A3, 2 * SIZE(AO) + + MADD C22, C22, A6, B4 + LD A4, 3 * SIZE(AO) + + MADD C32, C32, A7, B4 + bgtz L, .L2410 + MADD C42, C42, A8, B4 + + .align 4 +.L242: + andi L, K, 1 + blez L, .L240 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L240: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, 
C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + .align 4 +.L22: + andi I, M, 2 + blez I, .L21 + NOP + + .align 4 +.L221: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L222 + MOV C44, C11 + + + .align 4 +.L2210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A3, 2 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C22, C22, A2, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A4, B3 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A3, B4 + LD B2, 1 * SIZE(BO) + + MADD C22, C22, A4, B4 + bgtz L, .L2210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L222: + andi L, K, 1 + blez L, .L220 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L220: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L211: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L212 + MOV C44, C11 + + + .align 4 +.L2110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A1, B2 + LD B3, 2 * SIZE(BO) + + LD B4, 3 * SIZE(BO) + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A2, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, A2, B4 + LD B1, 0 * SIZE(BO) + + bgtz L, .L2110 + LD B2, 1 * SIZE(BO) + + + .align 4 +.L212: + andi L, K, 1 + blez L, .L210 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L210: # Write Back + LD A1, 0 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + + + .align 4 +.L20: + move B, BO + NOP + .align 4 @@ -588,6 +6026,429 @@ blez J, .L999 NOP +.L18: + dsra I, M, 3 # MR=8 + move AO, A # Reset A + blez I, .L14 + NOP + + + .align 4 +.L181: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 
2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L182 + MOV C44, C11 + + + .align 4 +.L1810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C13, C13, A5, B1 + LD B2, 1 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + LD A2, 13 * SIZE(AO) + + MADD C43, C43, A8, B1 + LD A3, 14 * SIZE(AO) + + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B2 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B2 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C23, C23, A2, B2 + LD A8, 7 * SIZE(AO) + + MADD C33, C33, A3, B2 + LD B1, 0 * SIZE(BO) + + MADD C43, C43, A4, B2 + LD A1, 0 * SIZE(AO) + + MADD C31, C31, B7, B2 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, B8, B2 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1810 + LD A4, 3 * SIZE(AO) + + .align 4 +.L182: + andi L, K, 1 + blez L, .L180 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L180: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + LD A5, 4 * SIZE(C) + LD A6, 5 * SIZE(C) + LD A7, 6 * SIZE(C) + LD A8, 7 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + MADD A5, A5, C13, ALPHA + MADD A6, A6, C23, ALPHA + MADD A7, A7, C33, ALPHA + MADD A8, A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 7 * SIZE(C) + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP + + + .align 4 +.L14: + andi I, M, 4 # MR=4 + blez I, .L12 + NOP + + .align 4 +.L141: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L142 + MOV C44, C11 + + + .align 4 +.L1410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 1 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 2 * SIZE + + MADD C41, C41, A4, B1 + LD A7, 6 * SIZE(AO) + + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, A8, B3 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1410 + LD A4, 3 * SIZE(AO) + + .align 4 +.L142: + andi L, K, 1 + blez L, .L140 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu AO, 
AO, 4 * SIZE
+ daddiu BO, BO, 1 * SIZE
+
+
+ .align 4
+.L140: # Write Back
+ LD A1, 0 * SIZE(C)
+ LD A2, 1 * SIZE(C)
+ LD A3, 2 * SIZE(C)
+ LD A4, 3 * SIZE(C)
+
+ MADD A1, A1, C11, ALPHA
+ MADD A2, A2, C21, ALPHA
+ MADD A3, A3, C31, ALPHA
+ MADD A4, A4, C41, ALPHA
+
+ ST A1, 0 * SIZE(C)
+ ST A2, 1 * SIZE(C)
+ ST A3, 2 * SIZE(C)
+ ST A4, 3 * SIZE(C)
+ daddiu C, C, 4 * SIZE
+
+ .align 4
+.L12:
+ andi I, M, 2
+ blez I, .L11
+ NOP
+
+ .align 4
+.L121:
+ move BO, B # Reset B
+ dsra L, K, 1 # UnRoll K=2
+
+ MTC $0, C11 # CLEAR RESULTS REGISTERS
+ MOV C12, C11
+ LD A1, 0 * SIZE(AO)
+
+ MOV C21, C11
+ MOV C22, C11
+ LD A2, 1 * SIZE(AO)
+
+ MOV C31, C11
+ MOV C32, C11
+ LD B1, 0 * SIZE(BO)
+
+ MOV C41, C11
+ MOV C42, C11
+
+ MOV C43, C11
+ blez L, .L122
+ MOV C44, C11
+
+
+ .align 4
+.L1210:
+ daddiu L, L, -1
+ MADD C11, C11, A1, B1
+ LD B3, 1 * SIZE(BO)
+
+ MADD C21, C21, A2, B1
+ daddiu BO, BO, 2 * SIZE
+
+ LD A3, 2 * SIZE(AO)
+ LD A4, 3 * SIZE(AO)
+ daddiu AO, AO, 4 * SIZE
+
+ MADD C11, C11, A3, B3
+ LD B1, 0 * SIZE(BO)
+
+ MADD C21, C21, A4, B3
+ LD A1, 0 * SIZE(AO)
+ bgtz L, .L1210
+ LD A2, 1 * SIZE(AO)
+
+
+ .align 4
+.L122:
+ andi L, K, 1
+ blez L, .L120
+ LD ALPHA, 152($fp)
+
+ MADD C11, C11, A1, B1
+ MADD C21, C21, A2, B1
+ daddiu AO, AO, 2 * SIZE
+ daddiu BO, BO, 1 * SIZE
+
+
+ .align 4
+.L120: # Write Back
+ LD A1, 0 * SIZE(C)
+ LD A2, 1 * SIZE(C)
+
+ MADD A1, A1, C11, ALPHA
+ MADD A2, A2, C21, ALPHA
+
+ ST A1, 0 * SIZE(C)
+ ST A2, 1 * SIZE(C)
+
+ daddiu C, C, 2 * SIZE
+
+
+ .align 4
+.L11:
+ andi I, M, 1
+ blez I, .L10
+ NOP
+
+ .align 4
+.L111:
+ move BO, B # Reset B
+ dsra L, K, 1 # UnRoll K=2
+
+ MTC $0, C11 # CLEAR RESULTS REGISTERS
+ MOV C12, C11
+ LD A1, 0 * SIZE(AO)
+
+ MOV C21, C11
+ MOV C22, C11
+ LD B1, 0 * SIZE(BO)
+
+ MOV C31, C11
+ blez L, .L112
+ MOV C32, C11
+
+
+
+ .align 4
+.L1110:
+ daddiu L, L, -1
+ MADD C11, C11, A1, B1
+
+ LD A2, 1 * SIZE(AO)
+ LD B2, 1 * SIZE(BO)
+
+ daddiu AO, AO, 2 * SIZE
+ daddiu BO, BO, 2 * SIZE
+
+ MADD C11, C11, A2, B2
+ LD A1, 0 * SIZE(AO)
+ LD B1, 0 * SIZE(BO)
+
+ bgtz L, .L1110
+ NOP
+
+
+ .align 4
+.L112:
+ andi L, K, 1
+ blez L, .L110
+ LD ALPHA, 152($fp)
+
+ MADD C11, C11, A1, B1
+ daddiu AO, AO, 1 * SIZE
+ daddiu BO, BO, 1 * SIZE
+
+
+ .align 4
+.L110: # Write Back
+ LD A1, 0 * SIZE(C)
+
+ MADD A1, A1, C11, ALPHA
+
+ ST A1, 0 * SIZE(C)
+
+ daddiu C, C, 1 * SIZE
+
+
+ .align 4
+.L10:
+ move B, BO
+ NOP
 .L999:
@@ -627,6 +6488,6 @@
 EPILOGUE
 # .set macro
 # .set reorder
-# .end REALNAME
-# .size REALNAME, .-REALNAME
-#.ident "GCC: (Debian 4.4.6-6) 4.4.6"
+# .end gemm
+# .size gemm, .-gemm
+# .ident "GCC: (Debian 4.4.6-6) 4.4.6"

From 74a3f634890d950ab67c2557232d49440a0d2e1c Mon Sep 17 00:00:00 2001
From: traz
Date: Thu, 1 Sep 2011 17:15:28 +0000
Subject: [PATCH 09/52] Tune mb, kb, nb sizes to get the best performance.

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 2c3021710..ecdae2e67 100644
--- a/param.h
+++ b/param.h
@@ -1497,7 +1497,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_P 64
 #define ZGEMM_DEFAULT_P 32
 
-#define SGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 192
 #define DGEMM_DEFAULT_Q 112
 #define CGEMM_DEFAULT_Q 100
 #define ZGEMM_DEFAULT_Q 80

From a15bc9582485d4f5dab3adf7724488a41352047d Mon Sep 17 00:00:00 2001
From: traz
Date: Fri, 2 Sep 2011 09:15:09 +0000
Subject: [PATCH 10/52] Add strmm part.
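
The TRMM path reuses the GEMM microkernel, but it must only accumulate
over the triangular band of A, so each tile derives its K range and its
packed-pointer start offsets from the running OFFSET (KK) instead of
looping over the full K. The C below is an illustrative sketch only, not
the shipped assembly: trmm_tile_k, mr and nr are made-up names, and
LEFT/TRANSA mirror the kernel's preprocessor macros. It shows what the
KK/TEMP arithmetic in this patch computes for an mr x nr tile.

    /* Sketch of the per-tile K length used by the TRMM path.
     * kk is the running OFFSET; mr/nr are the register tile sizes
     * (8 and 4 in this kernel).  Mirrors the dsubu/daddiu TEMP
     * sequences added in this patch. */
    #include <stdio.h>

    static long trmm_tile_k(long k, long kk, int left, int transa,
                            long mr, long nr)
    {
        if ((left && !transa) || (!left && transa))
            return k - kk;             /* dsubu  TEMP, K, KK      */
        return kk + (left ? mr : nr);  /* daddiu TEMP, KK, 8 or 4 */
    }

    int main(void)
    {
        /* K=100, OFFSET=16, 8x4 tile (values chosen for illustration) */
        printf("%ld\n", trmm_tile_k(100, 16, 1, 0, 8, 4)); /* 84 */
        printf("%ld\n", trmm_tile_k(100, 16, 1, 1, 8, 4)); /* 24 */
        return 0;
    }

Before the loop the packed pointers are advanced past the skipped part
in the same spirit (AO += KK*mr*SIZE, BO = B + KK*nr*SIZE), and after
each tile KK advances by mr when LEFT is defined.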
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 1344 ++++++++++++++++++++++++++- 1 file changed, 1327 insertions(+), 17 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 02a0833dd..1b4dae892 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -114,6 +114,12 @@ #define R16 16 #define R17 17 +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + # .text # .align 2 ## .globl gemm @@ -165,6 +171,15 @@ .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE + +#if defined(TRMMKERNEL) + LD OFFSET, 192($fp) +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + blez J, .L2 ST ALPHA, 152($fp) @@ -181,11 +196,81 @@ daddu CO4, CO3, LDC daddu PREA, A, PREA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + blez I, .L44 daddu C, CO4, LDC .align 4 .L481: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 3 + BASE_SHIFT # kk*8mr*datasize + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L # AO point to the data addr + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + FETCH $0, 4 * SIZE(CO3) + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + FETCH $0, 4 * SIZE(CO4) + +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK # TEMP is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 6 + blez L, .L482 + NOP +#else + # GEMM PART move BO, B # Reset B dsra L, K, 6 # UnRoll K=64 @@ -231,6 +316,7 @@ PLU B4, B2, B2 blez L, .L482 FETCH $0, 4 * SIZE(CO4) +#endif .L4810: daddiu L, L, -1 @@ -2413,7 +2499,11 @@ .align 4 .L482: +#ifndef TRMMKERNEL andi L, K, 32 +#else + andi L, TEMP, 32 +#endif blez L, .L483 NOP @@ -3508,7 +3598,11 @@ .align 4 .L483: +#ifndef TRMMKERNEL andi L, K, 16 +#else + andi L, TEMP, 16 +#endif blez L, .L484 NOP @@ -4059,7 +4153,11 @@ .align 4 .L484: +#ifndef TRMMKERNEL andi L, K, 8 +#else + andi L, TEMP, 8 +#endif blez L, .L485 NOP @@ -4338,7 +4436,11 @@ .align 4 .L485: +#ifndef TRMMKERNEL andi L, K, 4 +#else + andi L, TEMP, 4 +#endif blez L, .L486 NOP @@ -4481,7 +4583,11 @@ .align 4 .L486: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L487 NOP @@ -4558,7 +4664,11 @@ .align 4 .L487: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L480 LD ALPHA, 152($fp) @@ -4592,6 +4702,7 @@ .align 4 .L480: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 @@ -4762,7 +4873,141 @@ daddiu CO3, CO3, 8 * SIZE bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE +#else + daddiu I, I, -1 + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + CVTU A5, C33 # A5=C33.upper=c16 + CVTU A6, C31 # A6=C31.upper=c26 + CVTU 
A7, C43 # A7=C43.upper=c18 + CVTU A8, C41 # A8=C41.upper=c28 + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + MUL A5, A5, ALPHA # c16 + MUL A6, A6, ALPHA # c26 + MUL A7, A7, ALPHA # c18 + MUL A8, A8, ALPHA # c28 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + MUL C31, C31, ALPHA # c16 + ST A5, 5 * SIZE(CO1) + + MUL C33, C33, ALPHA # c26 + ST A6, 5 * SIZE(CO2) + + MUL C41, C41, ALPHA # c18 + ST A7, 7 * SIZE(CO1) + + MUL C43, C43, ALPHA # c28 + ST A8, 7 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A2, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A3, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A4, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + CVTU A5, C34 # B5=C32.upper=c46 + ST C31, 4 * SIZE(CO1) + + CVTU A6, C32 # B6=C24.upper=c36 + ST C33, 4 * SIZE(CO2) + + CVTU A7, C44 # B7=C42.upper=c48 + ST C41, 6 * SIZE(CO1) + + CVTU A8, C42 # A1=C44.upper=c38 + ST C43, 6 * SIZE(CO2) + + MUL A1, A1, ALPHA # c31 + MUL A2, A2, ALPHA + MUL A3, A3, ALPHA + MUL A4, A4, ALPHA + MUL A5, A5, ALPHA + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + MUL C32, C32, ALPHA + ST A5, 5 * SIZE(CO3) + + MUL C34, C34, ALPHA + ST A6, 5 * SIZE(CO4) + + MUL C42, C42, ALPHA + ST A7, 7 * SIZE(CO3) + + MUL C44, C44, ALPHA + ST A8, 7 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + ST C32, 4 * SIZE(CO3) + ST C34, 4 * SIZE(CO4) + ST C42, 6 * SIZE(CO3) + ST C44, 6 * SIZE(CO4) + + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE + daddiu CO3, CO3, 8 * SIZE + daddiu CO4, CO4, 8 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + bgtz I, .L481 +#endif .align 4 .L44: @@ -4772,6 +5017,65 @@ .align 4 .L441: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 + +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddu TEMP, KK, 4 +#else + daddu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L442 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -4806,13 +5110,12 @@ FETCH $0, 0 * SIZE(CO3) MOV 
C44, C11 - PLU B3, B1, B1 - FETCH $0, 0 * SIZE(CO4) - PLU B4, B2, B2 + FETCH $0, 0 * SIZE(CO4) blez L, .L442 - NOP + PLU B4, B2, B2 +#endif .L4410: # daddiu L, L, -1 @@ -4907,7 +5210,11 @@ .align 4 .L442: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L443 NOP @@ -4960,7 +5267,11 @@ .align 4 .L443: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L440 LD ALPHA, 152($fp) @@ -4981,6 +5292,7 @@ .align 4 .L440: +#ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) @@ -5069,6 +5381,86 @@ daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + CVTU A3, C23 # A3=C23.upper=c14 + CVTU A4, C21 # A4=C21.upper=c24 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + MUL A3, A3, ALPHA # c14 + MUL A4, A4, ALPHA # c24 + + MUL C11, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MUL C13, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MUL C21, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MUL C23, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + CVTU A5, C14 # B1=C12.upper=c42 + ST C11, 0 * SIZE(CO1) + + CVTU A6, C12 # B2=C14.upper=c32 + ST C13, 0 * SIZE(CO2) + + CVTU A7, C24 # B3=C22.upper=c44 + ST C21, 2 * SIZE(CO1) + + CVTU A8, C22 # B4=C24.upper=c34 + ST C23, 2 * SIZE(CO2) + + MUL A5, A5, ALPHA # c31 + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA + ST A5, 1 * SIZE(CO3) + + MUL C14, C14, ALPHA + ST A6, 1 * SIZE(CO4) + + MUL C22, C22, ALPHA + ST A7, 3 * SIZE(CO3) + + MUL C24, C24, ALPHA + ST A8, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif .align 4 .L42: @@ -5078,6 +5470,62 @@ .align 4 .L421: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + PLU B3, B1, B1 + + FETCH $0, 0 * SIZE(CO4) + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L422 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -5110,13 +5558,12 @@ FETCH $0, 0 * SIZE(CO3) MOV C44, C11 - PLU B3, B1, B1 - FETCH $0, 0 * SIZE(CO4) - PLU B4, B2, B2 + FETCH $0, 0 * SIZE(CO4) blez L, .L422 - NOP + PLU B4, B2, B2 +#endif .L4210: daddiu L, L, -1 @@ -5168,7 +5615,11 @@ .align 4 .L422: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L423 NOP @@ -5196,7 
+5647,11 @@ PLU B4, B2, B2 .L423: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L420 LD ALPHA, 152($fp) @@ -5210,6 +5665,7 @@ .align 4 .L420: +#ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) @@ -5256,6 +5712,60 @@ daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE +#else + CVTU A1, C13 # A1=C13.upper=c12 + CVTU A2, C11 # A2=C11.upper=c22 + + MUL A1, A1, ALPHA # c12 + MUL A2, A2, ALPHA # c22 + + MUL C11, C11, ALPHA # c12 + MUL C13, C13, ALPHA # c22 + + CVTU A3, C14 # B1=C12.upper=c42 + CVTU A4, C12 # B2=C14.upper=c32 + + MUL A3, A3, ALPHA # c31 + ST A1, 1 * SIZE(CO1) + + MUL A4, A4, ALPHA + ST A2, 1 * SIZE(CO2) + + MUL C12, C12, ALPHA + ST C11, 0 * SIZE(CO1) + + MUL C14, C14, ALPHA + ST C13, 0 * SIZE(CO2) + + ST A3, 1 * SIZE(CO3) + ST A4, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -4 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif .align 4 @@ -5266,6 +5776,56 @@ .align 4 .L411: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L412 + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 @@ -5298,6 +5858,7 @@ MOV C43, C11 blez L, .L412 MOV C44, C11 +#endif .L4110: daddiu L, L, -1 @@ -5362,7 +5923,11 @@ LD B4, 3 * SIZE(BO) .L412: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L413 NOP @@ -5397,7 +5962,11 @@ LD B4, 3 * SIZE(BO) .L413: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L410 LD ALPHA, 152($fp) @@ -5410,6 +5979,7 @@ .align 4 .L410: +#ifndef TRMMKERNEL LD A5, 0 * SIZE(CO1) LD A6, 0 * SIZE(CO2) LD A7, 0 * SIZE(CO3) @@ -5429,9 +5999,47 @@ daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE +#else + MUL A5, C11, ALPHA + MUL A6, C12, ALPHA + MUL A7, C13, ALPHA + MUL A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 4 .L40: +#if 
defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif daddiu J, J, -1 move B, BO bgtz J, .L48 @@ -5451,13 +6059,75 @@ move AO, A # Reset A move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif daddu CO2, C, LDC blez I, .L24 daddu C, CO2, LDC - .align 4 .L281: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 3 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L282 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5500,7 +6170,7 @@ MOV C43, C11 blez L, .L282 MOV C44, C11 - +#endif .align 4 .L2810: @@ -5582,7 +6252,11 @@ .align 4 .L282: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L280 LD ALPHA, 152($fp) @@ -5609,6 +6283,7 @@ .align 4 .L280: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(CO1) @@ -5680,6 +6355,72 @@ daddiu CO1, CO1, 8 * SIZE bgtz I, .L281 daddiu CO2, CO2, 8 * SIZE +#else + daddiu I, I, -1 + + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + MUL B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MUL B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MUL B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MUL B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MUL B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MUL B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MUL B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MUL C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE +#endif .align 4 @@ -5690,6 +6431,58 @@ .align 4 .L241: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV 
C34, C11
+
+ MOV C43, C11
+ NOP
+ MOV C44, C11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 4
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra L, TEMP, 1
+ blez L, .L242
+ NOP
+
+#else
 move BO, B # Reset B
 dsra L, K, 1 # UnRoll K=4
@@ -5723,7 +6516,7 @@
 MOV C43, C11
 blez L, .L242
 MOV C44, C11
-
+#endif
 .align 4
 .L2410:
@@ -5775,7 +6568,11 @@
 .align 4
 .L242:
+#ifndef TRMMKERNEL
 andi L, K, 1
+#else
+ andi L, TEMP, 1
+#endif
 blez L, .L240
 LD ALPHA, 152($fp)
@@ -5793,6 +6590,7 @@
 .align 4
 .L240: # Write Back
+#ifndef TRMMKERNEL
 LD A1, 0 * SIZE(CO1)
 LD A2, 1 * SIZE(CO1)
 LD A3, 2 * SIZE(CO1)
@@ -5829,6 +6627,50 @@
 daddiu CO1, CO1, 4 * SIZE
 daddiu CO2, CO2, 4 * SIZE
+#else
+
+ MUL A1, C11, ALPHA
+ MUL A2, C21, ALPHA
+ MUL A3, C31, ALPHA
+ MUL A4, C41, ALPHA
+
+ MUL B1, C12, ALPHA
+ ST A1, 0 * SIZE(CO1)
+
+ MUL B2, C22, ALPHA
+ ST A2, 1 * SIZE(CO1)
+
+ MUL B3, C32, ALPHA
+ ST A3, 2 * SIZE(CO1)
+
+ MUL B4, C42, ALPHA
+ ST A4, 3 * SIZE(CO1)
+
+ ST B1, 0 * SIZE(CO2)
+ ST B2, 1 * SIZE(CO2)
+ ST B3, 2 * SIZE(CO2)
+ ST B4, 3 * SIZE(CO2)
+
+ daddiu CO1, CO1, 4 * SIZE
+ daddiu CO2, CO2, 4 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+ dsll L, TEMP, 2 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+#endif
 .align 4
 .L22:
@@ -5838,6 +6680,46 @@
 .align 4
 .L221:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ dsll L, KK, 1 + BASE_SHIFT
+ dsll TEMP, KK, 1 + BASE_SHIFT
+
+ daddu AO, AO, L
+ daddu BO, B, TEMP
+#endif
+ MTC $0, C11 # CLEAR RESULTS REGISTERS
+ MOV C12, C11
+ LD A1, 0 * SIZE(AO)
+
+ MOV C21, C11
+ MOV C22, C11
+ LD A2, 1 * SIZE(AO)
+
+ MOV C31, C11
+ MOV C32, C11
+ LD B1, 0 * SIZE(BO)
+
+ MOV C41, C11
+ MOV C42, C11
+ LD B2, 1 * SIZE(BO)
+
+ MOV C43, C11
+ MOV C44, C11
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ dsubu TEMP, K, KK
+#elif defined(LEFT)
+ daddiu TEMP, KK, 2
+#else
+ daddiu TEMP, KK, 2
+#endif
+ dsra L, TEMP, 1
+ blez L, .L222
+ NOP
+
+#else
 move BO, B # Reset B
 dsra L, K, 1 # UnRoll K=4
@@ -5860,6 +6742,7 @@
 MOV C43, C11
 blez L, .L222
 MOV C44, C11
+#endif
 .align 4
@@ -5895,7 +6778,11 @@
 .align 4
 .L222:
+#ifndef TRMMKERNEL
 andi L, K, 1
+#else
+ andi L, TEMP, 1
+#endif
 blez L, .L220
 LD ALPHA, 152($fp)
@@ -5909,6 +6796,7 @@
 .align 4
 .L220: # Write Back
+#ifndef TRMMKERNEL
 LD A1, 0 * SIZE(CO1)
 LD A2, 1 * SIZE(CO1)
@@ -5929,7 +6817,39 @@
 daddiu CO1, CO1, 2 * SIZE
 daddiu CO2, CO2, 2 * SIZE
+#else
+ MUL A1, C11, ALPHA
+ MUL A2, C21, ALPHA
+ MUL B1, C12, ALPHA
+ MUL B2, C22, ALPHA
+
+ ST A1, 0 * SIZE(CO1)
+ ST A2, 1 * SIZE(CO1)
+ ST B1, 0 * SIZE(CO2)
+ ST B2, 1 * SIZE(CO2)
+
+ daddiu CO1, CO1, 2 * SIZE
+ daddiu CO2, CO2, 2 * SIZE
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -2
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+ dsll L, TEMP, 1 + BASE_SHIFT
+ dsll TEMP, TEMP, 1 + BASE_SHIFT
+
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 2
+#endif
+#endif
 .align 4
 .L21:
@@ -5939,6 +6859,46 @@
 .align 4
 .L211:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B # Reset B
+#else
+ 
dsll L, KK, BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 1 + blez L, .L212 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -5960,7 +6920,7 @@ MOV C43, C11 blez L, .L212 MOV C44, C11 - +#endif .align 4 .L2110: @@ -5987,7 +6947,11 @@ .align 4 .L212: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L210 LD ALPHA, 152($fp) @@ -5999,6 +6963,7 @@ .align 4 .L210: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) MADD A1, A1, C11, ALPHA @@ -6011,12 +6976,42 @@ daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE +#else + + MUL A1, C11, ALPHA + MUL B1, C12, ALPHA + + ST A1, 0 * SIZE(CO1) + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, 1 +#else + daddiu TEMP, TEMP, 2 +#endif + dsll L, TEMP, BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif .align 4 .L20: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif move B, BO - NOP @@ -6029,12 +7024,76 @@ .L18: dsra I, M, 3 # MR=8 move AO, A # Reset A + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif blez I, .L14 NOP .align 4 .L181: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 3 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 8 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L182 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6076,6 +7135,7 @@ MOV C43, C11 blez L, .L182 MOV C44, C11 +#endif .align 4 @@ -6138,7 +7198,11 @@ .align 4 .L182: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L180 LD ALPHA, 152($fp) @@ -6157,6 +7221,7 @@ .align 4 .L180: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(C) @@ -6189,7 +7254,51 @@ daddiu C, C, 8 * SIZE bgtz I, .L181 NOP +#else + daddiu I, I, -1 + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + MUL A5, C13, ALPHA + MUL A6, C23, ALPHA + MUL A7, C33, ALPHA + MUL A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 
7 * SIZE(C) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK + +#ifdef LEFT + daddiu TEMP, TEMP, -8 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 3 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 8 +#endif + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP +#endif .align 4 .L14: @@ -6199,6 +7308,56 @@ .align 4 .L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L142 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6231,7 +7390,7 @@ MOV C43, C11 blez L, .L142 MOV C44, C11 - +#endif .align 4 .L1410: @@ -6270,7 +7429,11 @@ .align 4 .L142: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L140 LD ALPHA, 152($fp) @@ -6284,6 +7447,7 @@ .align 4 .L140: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) LD A3, 2 * SIZE(C) @@ -6299,6 +7463,36 @@ ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) daddiu C, C, 4 * SIZE +#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + MUL A3, C31, ALPHA + MUL A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif +#endif .align 4 .L12: @@ -6308,6 +7502,48 @@ .align 4 .L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) ||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B # Reset B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + MOV C44, C11 +#if (defined(LEFT) && !defined(TRANSA)) ||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L122 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6329,7 +7565,7 @@ MOV C43, C11 blez L, .L122 MOV C44, C11 - +#endif .align 4 .L1210: @@ -6355,7 +7591,11 @@ .align 4 .L122: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L120 LD ALPHA, 152($fp) @@ -6367,6 +7607,7 @@ .align 4 .L120: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) @@ -6377,7 +7618,33 @@ ST A2, 1 * SIZE(C) daddiu C, C, 2 * SIZE 
+#else + MUL A1, C11, ALPHA + MUL A2, C21, ALPHA + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE +#if ( defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif .align 4 .L11: @@ -6387,6 +7654,38 @@ .align 4 .L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA))||\ + (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, BASE_SHIFT + daddu AO, AO, L + daddu BO, B, L +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + MOV C32, C11 +#if (defined(LEFT) && !defined(TRANSA))||\ + (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 1 + blez L, .L112 + NOP + +#else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 @@ -6401,7 +7700,7 @@ MOV C31, C11 blez L, .L112 MOV C32, C11 - +#endif .align 4 @@ -6425,7 +7724,11 @@ .align 4 .L112: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L110 LD ALPHA, 152($fp) @@ -6436,6 +7739,7 @@ .align 4 .L110: # Write Back +#ifndef TRMMKERNEL LD A1, 0 * SIZE(C) MADD A1, A1, C11, ALPHA @@ -6443,14 +7747,20 @@ ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE +#else + MUL A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + daddiu C, C, 1 * SIZE + +#endif .align 4 .L10: move B, BO NOP - .L999: ld $16, 0($fp) ld $17, 8($fp) From 23e182ca7c7cbf3dae151d3d084c074078b075fa Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 15:28:01 +0000 Subject: [PATCH 11/52] Fix stack-pointer bug for strmm. 
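This drops the frame pointer entirely: the $fp spill slot at 184($sp) goes
away, STACKSIZE shrinks from 192 to 160, all saves, restores and the ALPHA
spill are addressed straight off $sp, and the stack-passed TRMM OFFSET
argument is now read with LDARG at 160($sp) right after the register saves.
The old code fetched it through LD, which is a floating-point load macro in
this port, at 192($fp). For reference, a minimal C sketch of the resulting
frame layout; the enum and its names are invented here, and the offsets are
read off the stores in the diff below:

  /* Frame after `daddiu $sp,$sp,-STACKSIZE` (STACKSIZE == 160).
     The ninth, stack-passed argument sits just above the frame,
     i.e. at STACKSIZE($sp).  Names are illustrative only. */
  enum sgemm_frame_layout {
      GPR_SAVE    = 0,    /* sd $16..$22 at 0, 8, ..., 48        */
      FPR_SAVE    = 56,   /* ST $f24..$f28 at 56, 64, ..., 88    */
      TRMM_SAVE   = 96,   /* sd $23..$25, TRMMKERNEL builds only */
      FPR32_SAVE  = 120,  /* ST $f20..$f23, 32-bit ABI only      */
      ALPHA_SPILL = 152,  /* ST ALPHA, 152($sp)                  */
      ARG_OFFSET  = 160   /* LDARG OFFSET, 160($sp)              */
  };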
--- kernel/mips64/sgemm_kernel_8x4_ps.S | 116 +++++++++++++--------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 1b4dae892..93002547b 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -3,7 +3,7 @@ #include "common.h" #define FETCH ld -#define STACKSIZE 192 +#define STACKSIZE 160 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) @@ -127,7 +127,7 @@ # .ent gemm # .type gemm, @function #gemm: -# .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 +# .frame $sp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 # .set noreorder @@ -137,34 +137,34 @@ PROLOGUE daddiu $sp,$sp,-STACKSIZE - sd $fp,184($sp) - move $fp,$sp - sd $16, 0($fp) - sd $17, 8($fp) - sd $18, 16($fp) - sd $19, 24($fp) - sd $20, 32($fp) - sd $21, 40($fp) - sd $22, 48($fp) + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) - ST $f24, 56($fp) - ST $f25, 64($fp) - ST $f26, 72($fp) - ST $f27, 80($fp) - ST $f28, 88($fp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) #if defined(TRMMKERNEL) - sd $23, 96($fp) - sd $24, 104($fp) - sd $25, 112($fp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + + LDARG OFFSET, 160($sp) #endif #ifndef __64BIT__ - ST $f20,120($fp) - ST $f21,128($fp) - ST $f22,136($fp) - ST $f23,144($fp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) #endif .align 4 @@ -172,16 +172,12 @@ dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE -#if defined(TRMMKERNEL) - LD OFFSET, 192($fp) -#endif - #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif blez J, .L2 - ST ALPHA, 152($fp) + ST ALPHA, 152($sp) .L48: dsra I, M, 3 # MR=8 @@ -4670,7 +4666,7 @@ andi L, TEMP, 1 #endif blez L, .L480 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -5273,7 +5269,7 @@ andi L, TEMP, 1 #endif blez L, .L440 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -5653,7 +5649,7 @@ andi L, TEMP, 1 #endif blez L, .L420 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 @@ -5968,7 +5964,7 @@ andi L, TEMP, 1 #endif blez L, .L410 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 @@ -6258,7 +6254,7 @@ andi L, TEMP, 1 #endif blez L, .L280 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 @@ -6574,7 +6570,7 @@ andi L, TEMP, 1 #endif blez L, .L240 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -6784,7 +6780,7 @@ andi L, TEMP, 1 #endif blez L, .L220 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -6953,7 +6949,7 @@ andi L, TEMP, 1 #endif blez L, .L210 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 @@ -7204,7 +7200,7 @@ andi L, TEMP, 1 #endif blez L, .L180 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 @@ -7435,7 +7431,7 @@ andi L, TEMP, 1 #endif blez L, .L140 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -7597,7 +7593,7 @@ andi L, TEMP, 1 #endif blez L, .L120 - 
LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 @@ -7730,7 +7726,7 @@ andi L, TEMP, 1 #endif blez L, .L110 - LD ALPHA, 152($fp) + LD ALPHA, 152($sp) MADD C11, C11, A1, B1 daddiu AO, AO, 1 * SIZE @@ -7762,35 +7758,33 @@ NOP .L999: - ld $16, 0($fp) - ld $17, 8($fp) - ld $18, 16($fp) - ld $19, 24($fp) - ld $20, 32($fp) - ld $21, 40($fp) - ld $22, 48($fp) + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) - LD $f24, 56($fp) - LD $f25, 64($fp) - LD $f26, 72($fp) - LD $f27, 80($fp) - LD $f28, 88($fp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) #if defined(TRMMKERNEL) - ld $23, 96($fp) - ld $24, 104($fp) - ld $25, 112($fp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) #endif #ifndef __64BIT__ - LD $f20,120($fp) - LD $f21,128($fp) - LD $f22,136($fp) - LD $f23,144($fp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) #endif - move $sp,$fp - ld $fp,184($sp) daddiu $sp,$sp,STACKSIZE j $31 nop From a059c553a11ae1ae944161975075326661593a86 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:00:04 +0000 Subject: [PATCH 12/52] Fix a compute error for strmm. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 93002547b..2da94e5aa 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6985,9 +6985,9 @@ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT - daddiu TEMP, TEMP, 1 + daddiu TEMP, TEMP, -1 #else - daddiu TEMP, TEMP, 2 + daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT From 3274ff47b854bff0b0c5e66b24e50cddbafc7dca Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:50:50 +0000 Subject: [PATCH 13/52] Fix an error for strmm_LLTN. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 2da94e5aa..6191196f7 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6465,7 +6465,6 @@ MOV C34, C11 MOV C43, C11 - blez L, .L242 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK From 790614683656cddf8cf4fcbe2933274012dd3314 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 16:57:33 +0000 Subject: [PATCH 14/52] Fix an error for strmm_LLTN. --- kernel/mips64/sgemm_kernel_8x4_ps.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 6191196f7..efe62384c 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -6069,7 +6069,7 @@ move BO, B #else dsll L, KK, 3 + BASE_SHIFT - dsll TEMP, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP From 74d4cdb81a59393feee3affeb777d9724a5b6ff0 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 2 Sep 2011 19:41:06 +0000 Subject: [PATCH 15/52] Fix an illegal instruction for strmm_RTLU. 
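This kernel is written with explicit branch delay slots (the commented-out
prologue in this file carries a `.set noreorder` note), so the instruction
placed immediately after a branch executes before control transfers, whether
or not the branch is taken. With the new TRMM conditionals, the instruction
that used to follow `bgtz I, .L481` can be compiled out, leaving the delay
slot to whatever happens to come after the `#endif`; the added NOP pins the
slot to a harmless instruction in every configuration. A rough C model of
the delay-slot semantics (illustrative only; the helper names are invented):

  /* Model of `bgtz I, .L481` plus its delay slot. */
  static void delay_slot(void) { /* the kernel now puts NOP here */ }

  static void bgtz_model(long I, void (*l481)(void))
  {
      int taken = (I > 0);  /* the condition is sampled first,     */
      delay_slot();         /* then the next instruction runs      */
      if (taken) l481();    /* unconditionally; only afterwards    */
  }                         /* does control transfer               */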
---
 kernel/mips64/sgemm_kernel_8x4_ps.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S
index efe62384c..bc81d0eb5 100644
--- a/kernel/mips64/sgemm_kernel_8x4_ps.S
+++ b/kernel/mips64/sgemm_kernel_8x4_ps.S
@@ -5003,6 +5003,7 @@
 #endif
 bgtz I, .L481
+ NOP
 #endif
 .align 4

From 4727fe8abfd6fa93bb78347f535bfa86d75263d5 Mon Sep 17 00:00:00 2001
From: Xianyi Zhang
Date: Mon, 5 Sep 2011 15:13:05 +0000
Subject: [PATCH 16/52] Refs #47. On Loongson 3A, set the DGEMM_R parameter
 according to the number of threads. This improves double-precision BLAS3
 performance with multiple threads.

---
 common_macro.h                  |  4 +++-
 driver/others/blas_server.c     |  5 +++++
 driver/others/blas_server_omp.c |  5 +++++
 driver/others/memory.c          |  2 +-
 driver/others/parameter.c       | 28 ++++++++++++++++++++++++++++
 param.h                         |  3 ++-
 6 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/common_macro.h b/common_macro.h
index bcaa9f38b..0c34ecb01 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -2127,7 +2127,9 @@
 #endif

 #ifndef ASSEMBLER
-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
+extern BLASLONG gemm_offset_a;
+extern BLASLONG gemm_offset_b;
 extern BLASLONG sgemm_p;
 extern BLASLONG sgemm_q;
 extern BLASLONG sgemm_r;
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index c0f77c4c9..a026ccb26 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -797,6 +797,11 @@ void goto_set_num_threads(int num_threads) {

   blas_cpu_number = num_threads;

+#if defined(ARCH_MIPS64)
+  //set parameters for different number of threads.
+  blas_set_parameter();
+#endif
+
 }

 void openblas_set_num_threads(int num_threads) {

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 4fd4cd440..c45856fd9 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {

   omp_set_num_threads(blas_cpu_number);

+#if defined(ARCH_MIPS64)
+  //set parameters for different number of threads.
+  blas_set_parameter();
+#endif
+
 }

 void openblas_set_num_threads(int num_threads) {

diff --git a/driver/others/memory.c b/driver/others/memory.c
index dd8334477..ac9c87850 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
   if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
 #endif

-#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
 #ifndef DYNAMIC_ARCH
   blas_set_parameter();
 #endif
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 9e72fd24f..80f708452 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -45,8 +45,22 @@ int get_L2_size(void);
 #define DEFAULT_GEMM_P 128
 #define DEFAULT_GEMM_Q 128
 #define DEFAULT_GEMM_R 128
+#define DEFAULT_GEMM_OFFSET_A 0
+#define DEFAULT_GEMM_OFFSET_B 0

 /* Global Parameter */
+#if GEMM_OFFSET_A == gemm_offset_a
+BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
+#else
+BLASLONG gemm_offset_a = GEMM_OFFSET_A;
+#endif
+
+#if GEMM_OFFSET_B == gemm_offset_b
+BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
+#else
+BLASLONG gemm_offset_b = GEMM_OFFSET_B;
+#endif
+
 #if SGEMM_P == sgemm_p
 BLASLONG sgemm_p = DEFAULT_GEMM_P;
 #else
@@ -666,3 +680,17 @@ void blas_set_parameter(void){

 #endif
 #endif
+
+#if defined(ARCH_MIPS64)
+void blas_set_parameter(void){
+#if defined(LOONGSON3A)
+  if(blas_num_threads == 1){
+    //single thread
+    dgemm_r = 1000;
+  }else{
+    //multi thread
+    dgemm_r = 300;
+  }
+#endif
+}
+#endif
diff --git a/param.h b/param.h
index ecdae2e67..52a132049 100644
--- a/param.h
+++ b/param.h
@@ -1507,7 +1507,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //#define DGEMM_DEFAULT_R 200
 //#define DGEMM_DEFAULT_R 400
 //#define DGEMM_DEFAULT_R 192
-#define DGEMM_DEFAULT_R 1000
+#define DGEMM_DEFAULT_R dgemm_r
+//1000
 //#define DGEMM_DEFAULT_R 160
 //#define DGEMM_DEFAULT_R 270
 #define CGEMM_DEFAULT_R 1000

From 64fa709d1f2b758a5bcea3f32f2bb50ddae97e30 Mon Sep 17 00:00:00 2001
From: traz
Date: Mon, 5 Sep 2011 16:30:55 +0000
Subject: [PATCH 17/52] Fixed #46. Initialize variables in cblat3.f and
 zblat3.f.

---
 test/cblat3.f | 2 ++
 test/zblat3.f | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/test/cblat3.f b/test/cblat3.f
index b26be91e6..5df1ddd64 100644
--- a/test/cblat3.f
+++ b/test/cblat3.f
@@ -1301,6 +1301,8 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )
diff --git a/test/zblat3.f b/test/zblat3.f
index d6a522f2a..f03b1a617 100644
--- a/test/zblat3.f
+++ b/test/zblat3.f
@@ -1303,6 +1303,8 @@
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )

From 3c856c0c1a7f8484e87dd564af8b84427baea27b Mon Sep 17 00:00:00 2001
From: Xianyi Zhang
Date: Tue, 6 Sep 2011 18:27:33 +0000
Subject: [PATCH 18/52] Check the return value of pthread_create. Update the
 docs with a known issue on Loongson 3A.

---
 README                      | 1 +
 driver/others/blas_server.c | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README b/README
index 248741544..b67db1169 100644
--- a/README
+++ b/README
@@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
 9.Known Issues
 * The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32.
+* On Loongson 3A, make test may fail with a pthread_create error. The error code is EAGAIN. However, the same test case passes when it is rerun from the shell, so this does not appear to be a bug in OpenBLAS.

 10. Specification of Git Branches
 We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index a026ccb26..66067a05c 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
 /* Initializing routine */
 int blas_thread_init(void){
   BLASLONG i;
+  int ret;
 #ifdef NEED_STACKATTR
   pthread_attr_t attr;
 #endif
@@ -545,12 +546,16 @@ int blas_thread_init(void){
     pthread_cond_init (&thread_status[i].wakeup, NULL);

 #ifdef NEED_STACKATTR
-    pthread_create(&blas_threads[i], &attr,
+    ret=pthread_create(&blas_threads[i], &attr,
		     (void *)&blas_thread_server, (void *)i);
 #else
-    pthread_create(&blas_threads[i], NULL,
+    ret=pthread_create(&blas_threads[i], NULL,
		     (void *)&blas_thread_server, (void *)i);
 #endif
+    if(ret!=0){
+      fprintf(STDERR,"OpenBLAS: pthread_create error in blas_thread_init function. Error code: %d\n",ret);
+      exit(1);
+    }
   }

 #ifdef MONITOR

From 16fc083322eefd9e309b412e26db6fca62496afc Mon Sep 17 00:00:00 2001
From: Xianyi Zhang
Date: Thu, 8 Sep 2011 16:39:34 +0000
Subject: [PATCH 19/52] Refs #47. Fixed the parameter-setting bug in the
 Loongson 3A single-threaded build.

---
 driver/others/parameter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 80f708452..4a8542a93 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -684,13 +684,17 @@ void blas_set_parameter(void){
 #if defined(ARCH_MIPS64)
 void blas_set_parameter(void){
 #if defined(LOONGSON3A)
+#ifdef SMP
   if(blas_num_threads == 1){
+#endif
     //single thread
     dgemm_r = 1000;
+#ifdef SMP
   }else{
     //multi thread
     dgemm_r = 300;
   }
 #endif
+#endif
 }
 #endif

From d238a768abac572235cbe19db179587ebfc54545 Mon Sep 17 00:00:00 2001
From: traz
Date: Wed, 14 Sep 2011 15:32:25 +0000
Subject: [PATCH 20/52] Use ps instructions in cgemm.
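Two machine features carry this kernel. The Loongson quad load gsLQC1 fills
two FP registers from one 128-bit access and is emitted as a raw `.word`,
presumably because the assemblers of the day do not know the mnemonic. The
MIPS paired-single format then packs two floats into one 64-bit FP register,
so MADPS (madd.ps) performs two multiply-adds per instruction, PLU with both
sources equal swaps the two halves to build the imag/real pairing, and CVTU
(cvt.s.pu) extracts the upper half at write-back. As a sanity aid, a small
C encoder for the gsLQC1 instruction word, mirroring the macro at the top
of the new file below; the function name is invented and the field masks
are inferred from the shift amounts:

  #include <stdint.h>

  /* .word 0x32<<26 | base<<21 | ft<<16 | 1<<15 | offset<<6 | 1<<5 | fq */
  static uint32_t gslqc1_word(unsigned base, unsigned fq,
                              unsigned ft, unsigned offset)
  {
      return (0x32u << 26) | ((base & 31u) << 21) | ((ft & 31u) << 16)
           | (1u << 15) | ((offset & 0x1ffu) << 6) | (1u << 5)
           | (fq & 31u);
  }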
--- kernel/mips64/KERNEL.LOONGSON3A | 6 +- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 921 ++++++++++++++++++ param.h | 6 +- 3 files changed, 929 insertions(+), 4 deletions(-) create mode 100644 kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 4a195f265..91f2e7dd1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -17,9 +17,13 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_loongson3a_2x2.S +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S new file mode 100644 index 000000000..67d2333cb --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -0,0 +1,921 @@ +##define REALNAME gemm +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define STACKSIZE 192 +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +##### Parameter registers #### +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#### Pointer A, B, C #### +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 + +#define PREA $18 +#define PREB $19 + +#### Used registers #### +#define A1 $f0 +#define A2 $f1 +#define A3 $f2 +#define A4 $f3 +#define A5 $f4 +#define A6 $f5 +#define A7 $f6 +#define A8 $f7 + +#define B1 $f8 +#define B2 $f9 +#define B3 $f10 +#define B4 $f11 +#define B5 $f12 +#define B6 $f13 +#define B7 $f14 +#define B8 $f15 + +#define C11 $f16 +#define C12 $f17 +#define C21 $f18 +#define C22 $f19 +#define C31 $f20 +#define C32 $f21 +#define C41 $f22 +#define C42 $f23 +#define C13 $f24 +#define C14 $f25 +#define C23 $f26 +#define C24 $f27 +#define C33 $f28 +#define C34 $f29 +#define C43 $f30 +#define C44 $f31 + +#define I $2 +#define J $3 +#define L $7 + +#### Alpha register #### +#define ALPHA $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + +#define R12 12 +#define R13 13 + +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + + + PROLOGUE + + daddiu $sp,$sp,-STACKSIZE + + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + +#if defined(TRMMKERNEL) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + + LDARG OFFSET, 160($sp) +#endif + +#ifndef __64BIT__ + ST $f20,120($sp) + ST 
$f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) +#endif + + .align 4 +.L2: + dsra J, N, 1 # NR=2 + ST $f15, 152($sp) + + dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE + blez J, .L1 + ST $f16, 160($sp) + +.L24: + dsra I, M, 2 # MR=8 + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + blez I, .L22 + daddu C, CO2, LDC + + .align 4 +.L241: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + FETCH $0, 4 * SIZE(CO1) + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + FETCH $0, 4 * SIZE(CO2) + + MOV C43, C11 + PLU B3, B1, B1 + + MOV C44, C11 + blez L, .L242 + PLU B4, B2, B2 + +.L2410: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 6) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B1 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + bgtz L, .L2410 + MADPS C44, C44, A8, B8 + + + .align 4 +.L242: + andi L, K, 2 + blez L, .L247 + NOP + + .align 4 +.L247: + andi L, K, 1 + blez L, .L240 + NOP + + + .align 4 +.L240: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 
+ + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + SUB C41, C41, A4 + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + + ADD C31, A3, C31 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + ADD C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + SUB C13, A5, C13 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + SUB C23, A6, C23 + LD B5, 4 * SIZE(CO1) + + SUB C33, A7, C33 + LD B7, 6 * SIZE(CO1) + + SUB C43, A8, C43 + LD B2, 1 * SIZE(CO1) + + ADD C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + ADD C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + ADD C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + ADD C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + SUB C14, B5, C14 + MADD B3, B3, C21, A1 + + SUB C24, B6, C24 + MADD B5, B5, C31, A1 + + SUB C34, B7, C34 + MADD B7, B7, C41, A1 + + SUB C44, B8, C44 + MADD B2, B2, C13, A1 + + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 
6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + + ADD C31, A3, C31 +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + ADD C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + SUB C13, C13, A5 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + SUB C23, C23, A6 + LD B5, 4 * SIZE(CO1) + + SUB C33, C33, A7 + LD B7, 6 * SIZE(CO1) + + SUB C43, C43, A8 + LD B2, 1 * SIZE(CO1) + + ADD C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + ADD C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + ADD C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + ADD C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + SUB C14, C14, B5 + MADD B3, B3, C21, A1 + + SUB C24, C24, B6 + MADD B5, B5, C31, A1 + + SUB C34, C34, B7 + MADD B7, B7, C41, A1 + + SUB C44, C44, B8 + MADD B2, B2, C13, A1 + + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + MADD C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + MADD C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + MADD C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + MADD C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, A1, C11 # ac'+'bd + SUB C21, A2, C21 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + + SUB C31, A3, C31 +# LD A2, 0 * SIZE(A) # load alpha_i + LD A2, 160($sp) + + SUB C41, A4, C41 + LD B1, 0 * SIZE(CO1) + + ADD C13, A5, C13 # ad'+'cb + LD B3, 2 * SIZE(CO1) + + ADD C23, A6, C23 + LD B5, 4 * SIZE(CO1) + + 
ADD C33, A7, C33 + LD B7, 6 * SIZE(CO1) + + ADD C43, A8, C43 + LD B2, 1 * SIZE(CO1) + + SUB C12, B1, C12 + LD B4, 3 * SIZE(CO1) + + SUB C22, B2, C22 + LD B6, 5 * SIZE(CO1) + + SUB C32, B3, C32 + LD B8, 7 * SIZE(CO1) + + SUB C42, B4, C42 + MADD B1, B1, C11, A1 # A1 = alpha_r + + ADD C14, B5, C14 + MADD B3, B3, C21, A1 + + ADD C24, B6, C24 + MADD B5, B5, C31, A1 + + ADD C34, B7, C34 + MADD B7, B7, C41, A1 + + ADD C44, B8, C44 + NMSUB B2, B2, C13, A1 + + NMSUB B4, B4, C23, A1 + NMSUB B6, B6, C33, A1 + + NMSUB B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + LD C13, 0 * SIZE(CO2) + + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + LD C23, 2 * SIZE(CO2) + + MADD B4, B4, C12, A2 + MADD B6, B6, C13, A2 + LD C33, 4 * SIZE(CO2) + + MADD B8, B8, C14, A2 + LD C43, 6 * SIZE(CO2) + + LD C11, 1 * SIZE(CO2) + LD C21, 3 * SIZE(CO2) + LD C31, 5 * SIZE(CO2) + MADD C13, C13, C12, A1 + + LD C41, 7 * SIZE(CO2) + MADD C23, C23, C22, A1 + + MADD C33, C33, C32, A1 + ST B1, 0 * SIZE(CO1) + + MADD C43, C43, C42, A1 + ST B3, 2 * SIZE(CO1) + + NMSUB C11, C11, C14, A1 + ST B5, 4 * SIZE(CO1) + + NMSUB C21, C21, C24, A1 + ST B7, 6 * SIZE(CO1) + + NMSUB C31, C31, C34, A1 + ST B2, 1 * SIZE(CO1) + + NMSUB C41, C41, C44, A1 + ST B4, 3 * SIZE(CO1) + + NMSUB C13, C13, C14, A2 + ST B6, 5 * SIZE(CO1) + + NMSUB C23, C23, C24, A2 + ST B8, 7 * SIZE(CO1) + + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L241 + daddiu CO2, CO2, 8 * SIZE + + .align 4 +.L22: + andi I, M, 2 # MR=4 + blez I, .L21 + NOP + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L20: + daddiu J, J, -1 + move B, BO + bgtz J, .L24 + NOP + + + .align 4 +.L1: + andi J, N, 1 + blez J, .L999 + NOP + + .align 4 +.L10: + move B, BO + +.L999: + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + +#if defined(TRMMKERNEL) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) +#endif + +#ifndef __64BIT__ + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) +#endif + + daddiu $sp,$sp,STACKSIZE + j $31 + nop + + EPILOGUE diff --git a/param.h b/param.h index 52a132049..1c729e8b9 100644 --- a/param.h +++ b/param.h @@ -1486,7 +1486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -1499,7 +1499,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 112 -#define CGEMM_DEFAULT_Q 100 +#define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1024 @@ -1511,7 +1511,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//1000 //#define DGEMM_DEFAULT_R 160 //#define DGEMM_DEFAULT_R 270 -#define CGEMM_DEFAULT_R 1000 +#define CGEMM_DEFAULT_R 1024 //#define ZGEMM_DEFAULT_R 1000 #define ZGEMM_DEFAULT_R 1000 From 9679dd077e59407860dfa82e11d4f7ba07468496 Mon Sep 17 00:00:00 2001 From: traz Date: Wed, 14 Sep 2011 20:00:35 +0000 Subject: [PATCH 21/52] Fix some compute error. --- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 219 +++++++----------- 1 file changed, 82 insertions(+), 137 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 67d2333cb..7371ba280 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -12,10 +12,10 @@ #define M $4 #define N $5 #define K $6 -#define A $8 -#define B $9 -#define C $10 -#define LDC $11 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 #### Pointer A, B, C #### #define AO $12 @@ -120,6 +120,7 @@ PROLOGUE + LDARG LDC, 0($sp) daddiu $sp,$sp,-STACKSIZE sd $16, 0($sp) @@ -141,7 +142,7 @@ sd $24, 104($sp) sd $25, 112($sp) - LDARG OFFSET, 160($sp) + LDARG OFFSET, STACKSIZE($sp) #endif #ifndef __64BIT__ @@ -379,13 +380,12 @@ /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 - LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i - - SUB C41, C41, A4 ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 @@ -488,78 +488,60 @@ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r - LD A1, 152($sp) # load alpha_r - ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r - - ADD C41, A4, C41 - LD B1, 0 * SIZE(CO1) - SUB C13, A5, C13 # ad'+'cb - LD B3, 2 * SIZE(CO1) - SUB C23, A6, C23 - LD B5, 4 * SIZE(CO1) - SUB C33, A7, C33 - LD B7, 6 * SIZE(CO1) - SUB C43, A8, C43 - LD B2, 1 * SIZE(CO1) - ADD C12, B1, C12 - LD B4, 3 * SIZE(CO1) - ADD C22, B2, C22 - LD B6, 5 * SIZE(CO1) - ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - ADD C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - SUB C14, B5, C14 MADD B3, B3, C21, A1 - - SUB C24, B6, C24 MADD B5, B5, C31, A1 - - SUB C34, B7, C34 MADD B7, B7, C41, A1 - - SUB C44, B8, C44 MADD B2, B2, C13, A1 - MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 - MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 @@ -611,78 +593,60 @@ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r - LD A1, 152($sp) # load alpha_r - ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r - LD A2, 160($sp) # load alpha_i - ADD 
C41, A4, C41 - LD B1, 0 * SIZE(CO1) - + LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb - LD B3, 2 * SIZE(CO1) - SUB C23, C23, A6 - LD B5, 4 * SIZE(CO1) - SUB C33, C33, A7 - LD B7, 6 * SIZE(CO1) - SUB C43, C43, A8 - LD B2, 1 * SIZE(CO1) - ADD C12, B1, C12 - LD B4, 3 * SIZE(CO1) - ADD C22, B2, C22 - LD B6, 5 * SIZE(CO1) - ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + SUB C34, C34, B7 + SUB C44, C44, B8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - ADD C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - SUB C14, C14, B5 MADD B3, B3, C21, A1 - - SUB C24, C24, B6 MADD B5, B5, C31, A1 - - SUB C34, C34, B7 MADD B7, B7, C41, A1 - - SUB C44, C44, B8 MADD B2, B2, C13, A1 - MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 - MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) + + MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 @@ -731,113 +695,94 @@ #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ - SUB C11, A1, C11 # ac'+'bd - SUB C21, A2, C21 + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r - - SUB C31, A3, C31 -# LD A2, 0 * SIZE(A) # load alpha_i + SUB C41, C41, A4 LD A2, 160($sp) - - SUB C41, A4, C41 - LD B1, 0 * SIZE(CO1) +# LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb - LD B3, 2 * SIZE(CO1) - ADD C23, A6, C23 - LD B5, 4 * SIZE(CO1) - ADD C33, A7, C33 - LD B7, 6 * SIZE(CO1) - ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) - - SUB C12, B1, C12 LD B4, 3 * SIZE(CO1) - - SUB C22, B2, C22 LD B6, 5 * SIZE(CO1) - - SUB C32, B3, C32 LD B8, 7 * SIZE(CO1) - SUB C42, B4, C42 MADD B1, B1, C11, A1 # A1 = alpha_r - - ADD C14, B5, C14 MADD B3, B3, C21, A1 - - ADD C24, B6, C24 MADD B5, B5, C31, A1 - - ADD C34, B7, C34 MADD B7, B7, C41, A1 - - ADD C44, B8, C44 NMSUB B2, B2, C13, A1 - NMSUB B4, B4, C23, A1 NMSUB B6, B6, C33, A1 - NMSUB B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - LD C13, 0 * SIZE(CO2) - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - LD C23, 2 * SIZE(CO2) - MADD B4, B4, C12, A2 MADD B6, B6, C13, A2 - LD C33, 4 * SIZE(CO2) - MADD B8, B8, C14, A2 - LD C43, 6 * SIZE(CO2) + LD C13, 0 * SIZE(CO2) + LD C43, 6 * SIZE(CO2) + LD C23, 2 * SIZE(CO2) + LD C33, 4 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) - MADD C13, C13, C12, A1 - LD C41, 7 * SIZE(CO2) - MADD C23, C23, C22, A1 - MADD C33, C33, C32, A1 + MADD C13, C13, C12, A1 ST B1, 0 * SIZE(CO1) - MADD C43, C43, C42, A1 + MADD C23, C23, C22, A1 ST B3, 2 * SIZE(CO1) - NMSUB C11, C11, C14, A1 + MADD C33, C33, C32, A1 ST B5, 4 * SIZE(CO1) - NMSUB C21, 
C21, C24, A1 + MADD C43, C43, C42, A1 ST B7, 6 * SIZE(CO1) - NMSUB C31, C31, C34, A1 + NMSUB C11, C11, C14, A1 ST B2, 1 * SIZE(CO1) - NMSUB C41, C41, C44, A1 + NMSUB C21, C21, C24, A1 ST B4, 3 * SIZE(CO1) - NMSUB C13, C13, C14, A2 + NMSUB C31, C31, C34, A1 ST B6, 5 * SIZE(CO1) - NMSUB C23, C23, C24, A2 + NMSUB C41, C41, C44, A1 ST B8, 7 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 From 7fa3d23dd91ba6aaae6f77f210f338ba55422e49 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 15 Sep 2011 16:08:23 +0000 Subject: [PATCH 22/52] Complete cgemm function, but no optimization. --- common_mips64.h | 2 + .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 1689 ++++++++++++++++- 2 files changed, 1652 insertions(+), 39 deletions(-) diff --git a/common_mips64.h b/common_mips64.h index 2aa325bfa..35d8265bc 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.d #define CMPLE c.le.d #define CMPLT c.lt.d +#define NEG neg.d #else #define LD lwc1 #define ST swc1 @@ -177,6 +178,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define MADPS madd.ps #define CVTU cvt.s.pu #define CVTL cvt.s.pl +#define NEG neg.s #endif #if defined(__64BIT__) && defined(USE64BITINT) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 7371ba280..b57213a24 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -185,9 +185,9 @@ MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 + gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 - gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 @@ -195,20 +195,21 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - FETCH $0, 4 * SIZE(CO1) MOV C33, C11 FETCH $0, 0 * SIZE(CO2) - MOV C34, C11 - FETCH $0, 4 * SIZE(CO2) - - MOV C43, C11 - PLU B3, B1, B1 - MOV C44, C11 - blez L, .L242 + FETCH $0, 8 * SIZE(CO2) + MOV C34, C11 + MOV C43, C11 + + PLU B3, B1, B1 PLU B4, B2, B2 + blez L, .L242 + MOV C44, C11 .L2410: daddiu L, L, -1 @@ -234,9 +235,9 @@ MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 - PLU B7, B5, B5 - MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 @@ -264,9 +265,9 @@ MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 - PLU B3, B1, B1 - MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 @@ -282,12 +283,12 @@ gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B1 - daddiu BO, BO, 16 * SIZE # 4KR*4NR + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR MADPS C41, C41, A4, B1 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 - daddiu AO, AO, 32 * SIZE # 4KR*8MR + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 @@ -296,9 +297,9 @@ MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 - PLU B7, B5, B5 - MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 @@ -326,9 +327,9 @@ MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 - PLU B3, B1, B1 - MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 @@ -342,12 +343,100 @@ blez L, .L247 NOP + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS 
C41, C41, A4, B1 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 4 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + MADPS C24, C24, A6, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + .align 4 .L247: andi L, K, 1 blez L, .L240 NOP + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu BO, BO, 1 * 4 * SIZE # 4KR*4NR + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + .align 4 .L240: # Write Back @@ -417,13 +506,10 @@ MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i - NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 - NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 @@ -528,9 +614,9 @@ NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) @@ -633,9 +719,9 @@ NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) @@ -716,6 +802,14 @@ ADD C24, B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 + NEG C13, C13 + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + NEG C14, C14 + NEG C24, C24 + NEG C34, C34 + NEG C44, C44 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) @@ -730,18 +824,18 @@ MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 - NMSUB B2, B2, C13, A1 - NMSUB B4, B4, C23, A1 - NMSUB B6, B6, C33, A1 - NMSUB B8, B8, C43, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 - MADD B4, B4, C12, A2 - MADD B6, B6, C13, A2 - MADD B8, B8, C14, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C43, 6 * SIZE(CO2) @@ -764,16 +858,16 @@ MADD C43, C43, C42, A1 ST B7, 6 * SIZE(CO1) - NMSUB C11, C11, C14, A1 + MADD C11, C11, C14, A1 ST B2, 1 * SIZE(CO1) - NMSUB C21, C21, C24, A1 + MADD C21, C21, C24, A1 ST B4, 3 * SIZE(CO1) - NMSUB C31, C31, C34, A1 + MADD C31, C31, C34, A1 ST B6, 5 * SIZE(CO1) - NMSUB C41, C41, C44, A1 + MADD C41, C41, C44, A1 ST B8, 7 * SIZE(CO1) NMSUB C13, C13, C14, A2 @@ -807,12 +901,700 @@ blez I, .L21 NOP + .align 4 +.L221: + move BO, B # 
Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L222 + PLU B4, B2, B2 + +.L2210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A5, B1 + MADPS C21, C21, A6, B1 + + MADPS C12, C12, A5, B2 + MADPS C22, C22, A6, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A7, B5 + MADPS C21, C21, A8, B5 + + MADPS C12, C12, A7, B6 + MADPS C22, C22, A8, B6 + + MADPS C13, C13, A7, B7 + MADPS C23, C23, A8, B7 + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L2210 + PLU B4, B2, B2 + + + .align 4 +.L222: + andi L, K, 2 + blez L, .L227 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B5 + MADPS C21, C21, A4, B5 + gsLQC1(R13, F9, F8, 0) # A1 A2 + + MADPS C12, C12, A3, B6 + MADPS C22, C22, A4, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L227: + andi L, K, 1 + blez L, .L220 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + .align 4 +.L220: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 
= alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + SUB C24, A8, C24 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + + LD B1, 0 * 
SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + LD B5, 0 * SIZE(CO2) + LD B7, 2 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + LD B8, 3 * SIZE(CO2) + + MADD B5, B5, C12, A1 + MADD B7, B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MADD B6, B6, C14, A1 + MADD B8, B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + .align 4 .L21: andi I, M, 1 blez I, .L20 NOP + .align 4 +.L211: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + blez L, .L212 + PLU B4, B2, B2 + +.L2110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 2) # B1 B2 + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + gsLQC1(R13, F13, F12, 3) # B3 B4 + MADPS C11, C11, A3, B1 + MADPS C12, C12, A3, B2 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L2110 + PLU B4, B2, B2 + + + .align 4 +.L212: + andi L, K, 2 + blez L, .L217 + NOP + + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + gsLQC1(R12, F1, F0, 0) # A5 A6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L217: + andi L, K, 1 + blez L, .L210 + NOP + + MADPS C11, C11, A1, B1 + daddiu BO, BO, 4 * SIZE + MADPS C12, C12, A1, B2 + daddiu AO, AO, 2 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L210: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 
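+# Note: the stores to CO1 are interleaved with the CO2 MADD/NMSUB chain
+# so they can retire while the second column is still being computed.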
+ ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + NEG C14, C14 + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + LD B5, 0 * SIZE(CO2) + LD B6, 1 * SIZE(CO2) + + MADD B5, B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MADD B6, B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + .align 4 .L20: daddiu J, J, -1 @@ -827,6 +1609,835 @@ blez J, .L999 NOP +.L14: + dsra I, M, 2 # MR=8 + move AO, A # Reset A + move CO1, C + + blez I, .L12 + daddu C, CO1, LDC + + .align 4 +.L141: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + blez L, .L142 + PLU B4, B2, B2 + +.L1410: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 4) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 5) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, 
A8, B2 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + + MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A1, B5 + MADPS C21, C21, A2, B5 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + gsLQC1(R12, F7, F6, 7) # A7 A8 + MADPS C31, C31, A3, B5 + MADPS C41, C41, A4, B5 + + daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR + MADPS C13, C13, A1, B7 + MADPS C23, C23, A2, B7 + + MADPS C33, C33, A3, B7 + MADPS C43, C43, A4, B7 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B6 + MADPS C21, C21, A6, B6 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B6 + MADPS C41, C41, A8, B6 + + MADPS C13, C13, A5, B8 + MADPS C23, C23, A6, B8 + + MADPS C33, C33, A7, B8 + MADPS C43, C43, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1410 + PLU B4, B2, B2 + + + .align 4 +.L142: + andi L, K, 2 + blez L, .L147 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + gsLQC1(R13, F13, F8, 1) # B3 B4 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A5, B2 + MADPS C21, C21, A6, B2 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C31, C31, A7, B2 + MADPS C41, C41, A8, B2 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + + MADPS C13, C13, A5, B4 + MADPS C23, C23, A6, B4 + + MADPS C33, C33, A7, B4 + MADPS C43, C43, A8, B4 + PLU B3, B1, B1 + + + .align 4 +.L147: + andi L, K, 1 + blez L, .L140 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + daddiu AO, AO, 2 * 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + + .align 4 +.L140: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, 
C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # ad'+'cb + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B5, 4 * SIZE(CO1) + LD B7, 6 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + LD B6, 5 * SIZE(CO1) + LD B8, 7 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B5, B5, C31, A1 + MADD B7, B7, C41, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + MADD B6, B6, C33, A1 + MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + + bgtz I, .L141 + daddiu CO1, CO1, 8 * SIZE + + .align 4 +.L12: + 
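+# M & 2 tail of the N=1 panel: compute a 2x1 block of complex C
+# against the single remaining column of B.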
andi I, M, 2 # MR=4 + blez I, .L11 + NOP + + .align 4 +.L121: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L122 + PLU B4, B2, B2 + +.L1210: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + gsLQC1(R12, F5, F4, 2) # A5 A6 + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + gsLQC1(R12, F7, F6, 3) # A7 A8 + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C11, C11, A7, B6 + MADPS C21, C21, A8, B6 + + MADPS C13, C13, A7, B8 + MADPS C23, C23, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L1210 + PLU B4, B2, B2 + + + .align 4 +.L122: + andi L, K, 2 + blez L, .L127 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + PLU B7, B5, B5 + daddiu BO, BO, 1 * 4 * SIZE + + daddiu AO, AO, 2 * 4 * SIZE + MADPS C11, C11, A3, B2 + MADPS C21, C21, A4, B2 + + MADPS C13, C13, A3, B4 + MADPS C23, C23, A4, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + .align 4 +.L127: + andi L, K, 1 + blez L, .L120 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + .align 4 +.L120: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, 
A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + LD B1, 0 * SIZE(CO1) + LD B3, 2 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + LD B4, 3 * SIZE(CO1) + + MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B3, B3, C21, A1 + MADD B2, B2, C13, A1 + MADD B4, B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + blez L, .L112 + PLU B4, B2, B2 + +.L1110: + daddiu L, L, -1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + MADPS C11, C11, A1, B1 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR + + PLU B7, B5, B5 + PLU B8, B6, B6 + daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + MADPS C11, C11, A3, B5 + MADPS C13, C13, A3, B7 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MADPS C11, C11, A4, B6 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MADPS C13, C13, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L1110 + PLU B4, B2, B2 + + + .align 4 +.L112: + andi L, K, 2 + blez L, .L117 + NOP + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE + daddiu AO, AO, 4 * SIZE + + MADPS C11, C11, A2, B2 + MADPS C13, C13, A2, B4 + + gsLQC1(R13, F9, F8, 0) + gsLQC1(R12, F1, F0, 0) + PLU B3, B1, B1 + + + .align 4 +.L117: + andi L, K, 1 + blez L, .L110 + NOP + + daddiu BO, BO, 2 * SIZE + daddiu AO, AO, 2 * SIZE + + MADPS C11, C11, A1, B1 + MADPS C13, C13, A1, B3 + + + .align 4 +.L110: # Write Back + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb + LD 
A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + NEG C13, C13 + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) + + LD B1, 0 * SIZE(CO1) + LD B2, 1 * SIZE(CO1) + + MADD B1, B1, C11, A4 # A1 = alpha_r + MADD B2, B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + .align 4 .L10: move B, BO From ee4bb8bd2554f8cc5c539b2d9fc56d09836a338b Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 16 Sep 2011 16:08:39 +0000 Subject: [PATCH 23/52] Add ctrmm part in cgemm_kernel_loongson3a_4x2_ps.S. --- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 504 +++++++++++++++++- 1 file changed, 491 insertions(+), 13 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index b57213a24..16502216f 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -142,7 +142,7 @@ sd $24, 104($sp) sd $25, 112($sp) - LDARG OFFSET, STACKSIZE($sp) + LDARG OFFSET, STACKSIZE+8($sp) #endif #ifndef __64BIT__ @@ -157,59 +157,132 @@ dsra J, N, 1 # NR=2 ST $f15, 152($sp) +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE blez J, .L1 ST $f16, 160($sp) .L24: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + dsra I, M, 2 # MR=8 move AO, A # Reset A + + dsll PREA, K, 1 + ZBASE_SHIFT move CO1, C daddu CO2, C, LDC + daddu PREA, AO, PREA + blez I, .L22 daddu C, CO2, LDC .align 4 .L241: - move BO, B # Reset B - dsra L, K, 2 # UnRoll K=64 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, B, TEMP +#endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 + dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 - gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 + gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C14, C11 - gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C23, C11 - FETCH $0, 0 * SIZE(CO1) - - FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - - MOV C33, C11 - FETCH $0, 0 * SIZE(CO2) - FETCH $0, 8 * SIZE(CO2) + MOV C33, C11 MOV C34, C11 + MOV C43, C11 + MOV C44, C11 PLU B3, B1, B1 PLU B4, B2, B2 + daddu PREB, BO, PREB + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + 
daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 blez L, .L242 + NOP + +#else + + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=64 + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, ZBASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + gsLQC1(R13, F9, F8, 0) # B1 B2 + MOV C31, C11 + MOV C32, C11 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C41, C11 + MOV C42, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C23, C11 + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 MOV C44, C11 + daddu PREB, BO, PREB + + PLU B3, B1, B1 + PLU B4, B2, B2 + + FETCH $0, 8 * SIZE(CO1) + blez L, .L242 + FETCH $0, 8 * SIZE(CO2) +#endif .L2410: daddiu L, L, -1 @@ -225,9 +298,11 @@ MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 + FETCH $0, 0 * SIZE(PREB) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREA) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 @@ -239,6 +314,7 @@ PLU B7, B5, B5 PLU B8, B6, B6 + daddu PREB, PREB, 8 * SIZE MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -255,6 +331,7 @@ MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 + FETCH $0, 8 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 @@ -283,9 +360,10 @@ gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B1 - daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR MADPS C41, C41, A4, B1 + daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR + FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR @@ -317,11 +395,13 @@ MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 + FETCH $0, 24 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 + daddu PREA, PREA, 32 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -339,7 +419,11 @@ .align 4 .L242: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L247 NOP @@ -407,7 +491,11 @@ .align 4 .L247: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L240 NOP @@ -440,6 +528,7 @@ .align 4 .L240: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -891,6 +980,395 @@ #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C33, A7, C33 + ADD C43, A8, C43 + SUB C12, C12, B1 + SUB C22, C22, B2 + SUB C32, C32, B3 + SUB C42, C42, B4 + ADD C14, B5, C14 + ADD C24, B6, C24 + ADD C34, B7, C34 + ADD C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B3, 2 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B5, 4 * SIZE(CO1) + MUL C11, C14, A1 
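+# B1..B8 already hold the finished CO1 results here; the accumulators
+# that fed them (C11, C13, ..., C43) are dead and are reused below as
+# scratch for the CO2 column.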
+ MUL C21, C24, A1 + + ST B7, 6 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B2, 1 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B4, 3 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B6, 5 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B8, 7 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, B5, C14 + SUB C24, B6, C24 + SUB C34, B7, C34 + SUB C44, B8, C44 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB C43, C43, C44, A2 + + ST B4, 3 * SIZE(CO1) + MADD C11, C11, C12, A2 + MADD C21, C21, C22, A2 + + ST B6, 5 * SIZE(CO1) + MADD C31, C31, C32, A2 + MADD C41, C41, C42, A2 + + ST B8, 7 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C23, 2 * SIZE(CO2) + ST C33, 4 * SIZE(CO2) + ST C43, 6 * SIZE(CO2) + ST C11, 1 * SIZE(CO2) + ST C21, 3 * SIZE(CO2) + ST C31, 5 * SIZE(CO2) + ST C41, 7 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + ADD C12, B1, C12 + ADD C22, B2, C22 + ADD C32, B3, C32 + ADD C42, B4, C42 + SUB C14, C14, B5 + SUB C24, C24, B6 + + SUB C34, C34, B7 + SUB C44, C44, B8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + MUL C13, C12, A1 + MUL C23, C22, A1 + + ST B1, 0 * SIZE(CO1) + MUL C33, C32, A1 + MUL C43, C42, A1 + + ST B3, 2 * SIZE(CO1) + MUL C11, C14, A1 + MUL C21, C24, A1 + + ST B5, 4 * SIZE(CO1) + MUL C31, C34, A1 + MUL C41, C44, A1 + + ST B7, 6 * SIZE(CO1) + NMSUB C13, C13, C14, A2 + NMSUB C23, C23, C24, A2 + + ST B2, 1 * SIZE(CO1) + NMSUB C33, C33, C34, A2 + NMSUB 
C43, C43, C44, A2
+
+ ST B4, 3 * SIZE(CO1)
+ MADD C11, C11, C12, A2
+ MADD C21, C21, C22, A2
+
+ ST B6, 5 * SIZE(CO1)
+ MADD C31, C31, C32, A2
+ MADD C41, C41, C42, A2
+
+ ST B8, 7 * SIZE(CO1)
+ ST C13, 0 * SIZE(CO2)
+ ST C23, 2 * SIZE(CO2)
+ ST C33, 4 * SIZE(CO2)
+ ST C43, 6 * SIZE(CO2)
+ ST C11, 1 * SIZE(CO2)
+ ST C21, 3 * SIZE(CO2)
+ ST C31, 5 * SIZE(CO2)
+ ST C41, 7 * SIZE(CO2)
+
+#endif
+
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ /* (a - bi) * (c - di) */
+ SUB C11, C11, A1 # ac'+'bd
+ SUB C21, C21, A2
+ SUB C31, C31, A3
+ LD A1, 152($sp) # load alpha_r
+# LD A1, 0 * SIZE(A) # load alpha_r
+ SUB C41, C41, A4
+ LD A2, 160($sp)
+# LD A2, 0 * SIZE(A) # load alpha_i
+
+ ADD C13, A5, C13 # ad'+'cb
+ ADD C23, A6, C23
+ ADD C33, A7, C33
+ ADD C43, A8, C43
+ SUB C12, C12, B1
+ SUB C22, C22, B2
+ SUB C32, C32, B3
+ SUB C42, C42, B4
+ ADD C14, B5, C14
+ ADD C24, B6, C24
+ ADD C34, B7, C34
+ ADD C44, B8, C44
+
+ NEG C13, C13
+ NEG C23, C23
+ NEG C33, C33
+ NEG C43, C43
+ NEG C14, C14
+ NEG C24, C24
+ NEG C34, C34
+ NEG C44, C44
+
+ MUL B1, C11, A1 # A1 = alpha_r
+ MUL B3, C21, A1
+ MUL B5, C31, A1
+ MUL B7, C41, A1
+ MUL B2, C13, A1
+ MUL B4, C23, A1
+ MUL B6, C33, A1
+ MUL B8, C43, A1
+ NMSUB B1, B1, C13, A2 # A2 = alpha_i
+ NMSUB B3, B3, C23, A2
+ NMSUB B5, B5, C33, A2
+ NMSUB B7, B7, C43, A2
+ MADD B2, B2, C11, A2
+ MADD B4, B4, C21, A2
+ MADD B6, B6, C31, A2
+ MADD B8, B8, C41, A2
+
+ ST B1, 0 * SIZE(CO1)
+ MUL C13, C12, A1
+ MUL C23, C22, A1
+
+ ST B3, 2 * SIZE(CO1)
+ MUL C33, C32, A1
+ MUL C43, C42, A1
+
+ ST B5, 4 * SIZE(CO1)
+ MUL C11, C14, A1
+ MUL C21, C24, A1
+
+ ST B7, 6 * SIZE(CO1)
+ MUL C31, C34, A1
+ MUL C41, C44, A1
+
+ ST B2, 1 * SIZE(CO1)
+ NMSUB C13, C13, C14, A2
+ NMSUB C23, C23, C24, A2
+
+ ST B4, 3 * SIZE(CO1)
+ NMSUB C33, C33, C34, A2
+ NMSUB C43, C43, C44, A2
+
+ ST B6, 5 * SIZE(CO1)
+ MADD C11, C11, C12, A2
+ MADD C21, C21, C22, A2
+
+ ST B8, 7 * SIZE(CO1)
+ MADD C31, C31, C32, A2
+ MADD C41, C41, C42, A2
+
+ ST C13, 0 * SIZE(CO2)
+ ST C23, 2 * SIZE(CO2)
+ ST C33, 4 * SIZE(CO2)
+ ST C43, 6 * SIZE(CO2)
+ ST C11, 1 * SIZE(CO2)
+ ST C21, 3 * SIZE(CO2)
+ ST C31, 5 * SIZE(CO2)
+ ST C41, 7 * SIZE(CO2)
+#endif
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ dsubu TEMP, K, KK
+#ifdef LEFT
+ daddiu TEMP, TEMP, -4
+#else
+ daddiu TEMP, TEMP, -2
+#endif
+
+ dsll L, TEMP, 2 + ZBASE_SHIFT
+ dsll TEMP, TEMP, 1 + ZBASE_SHIFT
+
+ daddu AO, AO, L
+ daddu BO, BO, TEMP
+#endif
+
+#ifdef LEFT
+ daddiu KK, KK, 4
+#endif
+
+#endif
 daddiu CO1, CO1, 8 * SIZE
 bgtz I, .L241
 daddiu CO2, CO2, 8 * SIZE

From e08cfaf9ca9a65e28c4e0f790421aa03e7041c94 Mon Sep 17 00:00:00 2001
From: traz
Date: Fri, 16 Sep 2011 17:50:40 +0000
Subject: [PATCH 24/52] Complete all the complex single-precision functions of level3, but the performance needs further improvement.
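
The paired-single kernels accumulate the four real products of each complex
multiply in separate lanes and only recombine them at write-back. Below is a
scalar model of one 1x1 update, a sketch only: the re-in-lower/im-in-upper
lane order is inferred from how CVTU/SUB are used in this file, and every
name in the code is illustrative rather than taken from the kernel.

    /* Scalar sketch of one complex update as the PS kernel performs it:
       MADPS accumulates lane pairs, PLU supplies the swapped copy of b,
       CVTU extracts the upper lane, and alpha is applied with MADD/NMSUB. */
    #include <complex.h>

    static void cgemm_nn_1x1_sketch(int k, float alpha_r, float alpha_i,
                                    const float complex *a,
                                    const float complex *b,
                                    float complex *c)
    {
        float rr = 0.0f, ii = 0.0f;  /* lanes of C11: (re*re, im*im) */
        float ri = 0.0f, ir = 0.0f;  /* lanes of C13: (re*im, im*re) */
        for (int l = 0; l < k; l++) {
            rr += crealf(a[l]) * crealf(b[l]);
            ii += cimagf(a[l]) * cimagf(b[l]);
            ri += crealf(a[l]) * cimagf(b[l]);   /* uses PLU-swapped b */
            ir += cimagf(a[l]) * crealf(b[l]);
        }
        float re = rr - ii;          /* CVTU + SUB: ac - bd */
        float im = ri + ir;          /* CVTU + ADD: ad + bc */
        /* MADD/NMSUB write-back: c += (alpha_r + alpha_i*I) * (re + im*I) */
        *c += (alpha_r * re - alpha_i * im)
            + (alpha_r * im + alpha_i * re) * I;
    }

The RR/RC/CR/CC and the other conjugation cases differ only in the sign
pattern of the recombination (the ADD/SUB/NEG sequence); the MADPS
accumulation loop is the same for all of them.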
--- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 1091 ++++++++++++++++- 1 file changed, 1081 insertions(+), 10 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 16502216f..e78ad209f 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -1381,6 +1381,49 @@ .align 4 .L221: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + + FETCH $0, 8 * SIZE(CO1) + MOV C24, C11 + + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 8 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # MR=2 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L222 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -1407,6 +1450,7 @@ PLU B3, B1, B1 blez L, .L222 PLU B4, B2, B2 +#endif .L2210: daddiu L, L, -1 @@ -1484,7 +1528,11 @@ .align 4 .L222: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L227 NOP @@ -1527,7 +1575,11 @@ .align 4 .L227: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L220 NOP @@ -1547,6 +1599,7 @@ .align 4 .L220: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -1800,6 +1853,239 @@ ST B8, 3 * SIZE(CO2) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + CVTU A5, C12 + CVTU A6, C22 + + CVTU A7, C14 + CVTU A8, C24 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, A7, C14 + SUB C24, A8, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, 
A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + ADD C22, A6, C22 + SUB C14, C14, A7 + SUB C24, C24, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) + +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + SUB C22, C22, A6 + ADD C14, A7, C14 + ADD C24, A8, C24 + NEG C13, C13 + NEG C23, C23 + NEG C14, C14 + NEG C24, C24 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + MUL B5, C12, A1 + MUL B7, C22, A1 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + MUL B6, C14, A1 + MUL B8, C24, A1 + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + NMSUB B7, B7, C24, A2 + + MADD B6, B6, C12, A2 + MADD B8, B8, C22, A2 + + ST B5, 0 * SIZE(CO2) + ST B7, 2 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + ST B8, 3 * SIZE(CO2) +#endif + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE @@ -1812,6 +2098,41 @@ .align 4 .L211: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C14, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L212 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -1829,6 +2150,7 @@ PLU B3, 
B1, B1 blez L, .L212 PLU B4, B2, B2 +#endif .L2110: daddiu L, L, -1 @@ -1880,7 +2202,11 @@ .align 4 .L212: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L217 NOP @@ -1910,7 +2236,11 @@ .align 4 .L217: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L210 NOP @@ -1924,13 +2254,13 @@ .align 4 .L210: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 CVTU A5, C12 CVTU A7, C14 - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd @@ -2069,6 +2399,149 @@ ST B6, 1 * SIZE(CO2) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + CVTU A5, C12 + CVTU A7, C14 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C12, A5, C12 + SUB C14, A7, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) + +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + ADD C12, A5, C12 + SUB C14, C14, A7 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + SUB C12, C12, A5 + ADD C14, A7, C14 + NEG C13, C13 + NEG C14, C14 + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + MUL B5, C12, A4 + ST B1, 0 * SIZE(CO1) + MUL B6, C14, A4 + ST B2, 1 * SIZE(CO1) + + NMSUB B5, B5, C14, A2 + MADD B6, B6, C12, A2 + + ST B5, 0 * SIZE(CO2) + ST B6, 1 * SIZE(CO2) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + 
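+# LEFT (triangular factor on the left): this panel consumed one more
+# row of A (MR=1), so advance KK before the next M iteration.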
+#ifdef LEFT + daddiu KK, KK, 1 +#endif + +#endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE @@ -2077,6 +2550,11 @@ .L20: daddiu J, J, -1 move B, BO + +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + bgtz J, .L24 NOP @@ -2090,13 +2568,58 @@ .L14: dsra I, M, 2 # MR=8 move AO, A # Reset A - move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move CO1, C blez I, .L12 daddu C, CO1, LDC .align 4 .L141: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 2 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C31, C11 + MOV C41, C11 + + gsLQC1(R12, F3, F2, 1) # A3 A4 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + MOV C33, C11 + MOV C43, C11 + + FETCH $0, 8 * SIZE(CO1) + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 # define Mr=4 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L142 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2120,6 +2643,7 @@ PLU B3, B1, B1 blez L, .L142 PLU B4, B2, B2 +#endif .L1410: daddiu L, L, -1 @@ -2193,7 +2717,11 @@ .align 4 .L142: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L147 NOP @@ -2232,7 +2760,11 @@ .align 4 .L147: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L140 NOP @@ -2253,6 +2785,7 @@ .align 4 .L140: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -2433,20 +2966,20 @@ #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ - SUB C11, C11, A1 # ac'+'bd + SUB C11, C11, A1 # AC'+'BD SUB C21, C21, A2 SUB C31, C31, A3 - LD A1, 152($sp) # load alpha_r -# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R SUB C41, C41, A4 LD A2, 160($sp) -# LD A2, 0 * SIZE(A) # load alpha_i +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I - ADD C13, A5, C13 # ad'+'cb + ADD C13, A5, C13 # AD'+'CB ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 - NEG C13, C13 # ad'+'cb + NEG C13, C13 # AD'+'CB NEG C23, C23 NEG C33, C33 NEG C43, C43 @@ -2461,7 +2994,7 @@ LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) - MADD B1, B1, C11, A1 # A1 = alpha_r + MADD B1, B1, C11, A1 # A1 = ALPHA_R MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 @@ -2469,6 +3002,74 @@ MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C31 + CVTU A4, C41 + + CVTU A5, C13 + CVTU A6, C23 + + CVTU A7, C33 + CVTU A8, C43 + + CVTU B1, C12 + CVTU B2, C22 + + CVTU B3, C32 + CVTU B4, C42 + + CVTU B5, C14 + CVTU B6, C24 + + CVTU B7, C34 + CVTU B8, C44 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 +# 
LD A1, 0 * SIZE(A) # load alpha_r + SUB C31, C31, A3 + LD A1, 152($sp) # load alpha_r + SUB C41, C41, A4 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + ADD C13, A5, C13 # ad'+'cb + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 @@ -2488,6 +3089,159 @@ ST B8, 7 * SIZE(CO1) #endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + SUB C13, A5, C13 # ad'+'cb + SUB C23, A6, C23 + SUB C33, A7, C33 + SUB C43, A8, C43 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 +# LD A1, 0 * SIZE(A) # load alpha_r + ADD C31, A3, C31 + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + ADD C41, A4, C41 + LD A2, 160($sp) # load alpha_i + SUB C13, C13, A5 # ad'+'cb + SUB C23, C23, A6 + SUB C33, C33, A7 + SUB C43, C43, A8 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # AC'+'BD + SUB C21, C21, A2 + SUB C31, C31, A3 + LD A1, 152($sp) # LOAD ALPHA_R +# LD A1, 0 * SIZE(A) # LOAD ALPHA_R + SUB C41, C41, A4 + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # LOAD ALPHA_I + + ADD C13, A5, C13 # AD'+'CB + ADD C23, A6, C23 + ADD C33, A7, C33 + ADD C43, A8, C43 + NEG C13, C13 # AD'+'CB + NEG C23, C23 + NEG C33, C33 + NEG C43, C43 + + MUL B1, C11, A1 # A1 = ALPHA_R + MUL B3, C21, A1 + MUL B5, C31, A1 + MUL B7, C41, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + MUL B6, C33, A1 + MUL B8, C43, A1 + NMSUB B1, B1, C13, A2 # A2 = ALPHA_I + NMSUB B3, B3, C23, A2 + NMSUB B5, B5, C33, A2 + NMSUB B7, B7, C43, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + MADD B6, B6, C31, A2 + MADD B8, B8, C41, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B5, 4 * SIZE(CO1) + ST B7, 6 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) + ST B6, 5 * SIZE(CO1) + ST B8, 7 * SIZE(CO1) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + 
(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 2 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + +#endif bgtz I, .L141 daddiu CO1, CO1, 8 * SIZE @@ -2499,6 +3253,42 @@ .align 4 .L121: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C21, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + MOV C23, C11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 8 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L122 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2516,6 +3306,7 @@ PLU B3, B1, B1 blez L, .L122 PLU B4, B2, B2 +#endif .L1210: daddiu L, L, -1 @@ -2561,7 +3352,11 @@ .align 4 .L122: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L127 NOP @@ -2588,7 +3383,11 @@ .align 4 .L127: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L120 NOP @@ -2602,6 +3401,7 @@ .align 4 .L120: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 @@ -2737,6 +3537,141 @@ ST B4, 3 * SIZE(CO1) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A2, C21 + + CVTU A3, C13 + CVTU A4, C23 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, A3, C13 # ad'+'cb + SUB C23, A4, C23 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_r + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + ADD C21, A2, C21 + SUB C13, C13, A3 # ad'+'cb + SUB C23, C23, A4 +# LD A1, 0 * SIZE(A) # load alpha_r + LD A1, 152($sp) # load alpha_r +# LD A2, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) # load alpha_i + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * 
SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + SUB C21, C21, A2 + ADD C13, A3, C13 # ad'+'cb + ADD C23, A4, C23 + LD A1, 152($sp) # load alpha_r +# LD A1, 0 * SIZE(A) # load alpha_r + LD A2, 160($sp) +# LD A2, 0 * SIZE(A) # load alpha_i + NEG C13, C13 # ad'+'cb + NEG C23, C23 + + MUL B1, C11, A1 # A1 = alpha_r + MUL B3, C21, A1 + MUL B2, C13, A1 + MUL B4, C23, A1 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + NMSUB B3, B3, C23, A2 + MADD B2, B2, C11, A2 + MADD B4, B4, C21, A2 + + ST B1, 0 * SIZE(CO1) + ST B3, 2 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) + ST B4, 3 * SIZE(CO1) +#endif +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + +#endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE @@ -2749,6 +3684,37 @@ .align 4 .L111: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + MTC $0, C11 # CLEAR REAULTS REGISTERS + gsLQC1(R13, F9, F8, 0) # B1 B2 + + gsLQC1(R12, F1, F0, 0) # A1 A2 + MOV C13, C11 + + FETCH $0, 0 * SIZE(CO1) + + PLU B3, B1, B1 + PLU B4, B2, B2 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L112 + NOP + +#else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 @@ -2763,6 +3729,7 @@ PLU B3, B1, B1 blez L, .L112 PLU B4, B2, B2 +#endif .L1110: daddiu L, L, -1 @@ -2796,7 +3763,11 @@ .align 4 .L112: +#ifndef TRMMKERNEL andi L, K, 2 +#else + andi L, TEMP, 2 +#endif blez L, .L117 NOP @@ -2815,7 +3786,11 @@ .align 4 .L117: +#ifndef TRMMKERNEL andi L, K, 1 +#else + andi L, TEMP, 1 +#endif blez L, .L110 NOP @@ -2828,11 +3803,11 @@ .align 4 .L110: # Write Back +#ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd @@ -2912,6 +3887,99 @@ ST B2, 1 * SIZE(CO1) #endif +#else + daddiu I, I, -1 + CVTU A1, C11 + CVTU A3, C13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + /* (a + bi) * (c + di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb +# LD A1, 0 * SIZE(A) # load alpha_r + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i +# LD A2, 0 * SIZE(A) # load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + /* (a + bi) * (c - di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, A3, C13 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + /* (a - bi) * (c + di) */ + ADD C11, A1, C11 # ac'+'bd + SUB C13, C13, A3 # ad'+'cb + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) # 
load alpha_i + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + /* (a - bi) * (c - di) */ + SUB C11, C11, A1 # ac'+'bd + ADD C13, A3, C13 # ad'+'cb + NEG C13, C13 + LD A4, 152($sp) # load alpha_r + LD A2, 160($sp) + + MUL B1, C11, A4 # A1 = alpha_r + MUL B2, C13, A4 + NMSUB B1, B1, C13, A2 # A2 = alpha_i + MADD B2, B2, C11, A2 + + ST B1, 0 * SIZE(CO1) + ST B2, 1 * SIZE(CO1) +#endif + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + +#endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE @@ -2919,6 +3987,9 @@ .align 4 .L10: move B, BO +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif .L999: ld $16, 0($sp) From 831858b88351e350c9f6ad5c2d7f0c70d4cce18c Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 23 Sep 2011 20:59:48 +0000 Subject: [PATCH 25/52] Modify aligned address of sa and sb to improve the performance of multi-threads. --- driver/level3/gemm_thread_n.c | 6 +++--- driver/others/parameter.c | 4 ++-- param.h | 25 +++++++------------------ 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index ba54612eb..62907fa65 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,15 +71,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; //NULL; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;//NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { queue[0].sa = sa; - queue[0].sb = sb; + queue[0].sb = sa + GEMM_OFFSET_A1 * 5; queue[num_cpu - 1].next = NULL; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 4a8542a93..fc7f0447e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -688,11 +688,11 @@ void blas_set_parameter(void){ if(blas_num_threads == 1){ #endif //single thread - dgemm_r = 1000; + dgemm_r = 1024; #ifdef SMP }else{ //multi thread - dgemm_r = 300; + dgemm_r = 200; } #endif #endif diff --git a/param.h b/param.h index 1c729e8b9..4ffe05cf8 100644 --- a/param.h +++ b/param.h @@ -1493,33 +1493,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_P 44 #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 112 -#define CGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 1024 -//#define DGEMM_DEFAULT_R 300 -//#define DGEMM_DEFAULT_R 200 -//#define DGEMM_DEFAULT_R 400 -//#define DGEMM_DEFAULT_R 192 -#define DGEMM_DEFAULT_R dgemm_r -//1000 -//#define DGEMM_DEFAULT_R 160 -//#define DGEMM_DEFAULT_R 270 +#define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 1024 -//#define ZGEMM_DEFAULT_R 1000 -#define ZGEMM_DEFAULT_R 1000 +#define ZGEMM_DEFAULT_R 1024 -#define GEMM_OFFSET_A1 (DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*SIZE) -//#define GEMM_OFFSET_B1 0x10 -#define GEMM_OFFSET_B1 (DGEMM_DEFAULT_Q*DGEMM_DEFAULT_R*SIZE) -#define GEMM_OFFSET 0x100000 -#define GEMM_OFFSET1 0x40000 +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 #define SYMV_P 16 #endif From 9fe3049de67495e1ca916141624c985a80f3d6cb Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 26 Sep 2011 15:21:45 +0000 Subject: [PATCH 26/52] Adding conditional compilation(#if defined(LOONGSON3A)) to avoid affecting the performance of other platforms. --- driver/level3/gemm_thread_n.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index 62907fa65..f9007f831 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; - queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; //NULL; - queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;//NULL; +#if defined(LOONGSON3A) + queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; + queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; +#else + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; +#endif queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { +#if defined(LOONGSON3A) queue[0].sa = sa; queue[0].sb = sa + GEMM_OFFSET_A1 * 5; - +#else + queue[0].sa = sa; + queue[0].sb = sb; +#endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, From c1e618ea2d7fc44c6e90c2cb728124249e688947 Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 3 Nov 2011 13:53:48 +0000 Subject: [PATCH 27/52] Add complete gemv function on Loongson3a platform. 
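
The four kernels are plain C and share one pattern: walk A column by
column, software-prefetch the A column and the vector a few dozen
elements ahead (fahead = 30, or 60 in zgemv_n), and unroll the inner
update by hand (4x in gemv_n, 3x in gemv_t, 2x in the complex
versions).  A minimal sketch of the gemv_n hot path (ALPHA == 1,
INCX == INCY == 1), with GCC's __builtin_prefetch standing in for the
kernel's inline-asm "ld $0" prefetch; the function name is ours and
this is an illustration of the scheme, not the kernel itself:

    #include <stddef.h>

    /* y += A * x for column-major A with leading dimension lda */
    static void gemv_n_sketch(size_t m, size_t n, const double *a,
                              size_t lda, const double *x, double *y)
    {
        const size_t fahead = 30;            /* prefetch distance   */
        for (size_t j = 0; j < n; j++) {
            size_t i = 0;
            for (; i + 4 <= m;) {            /* unrolled by 4       */
                __builtin_prefetch(&a[lda * j + i + fahead]);
                __builtin_prefetch(&y[i + fahead]);
                y[i] += a[lda * j + i] * x[j]; i++;
                y[i] += a[lda * j + i] * x[j]; i++;
                y[i] += a[lda * j + i] * x[j]; i++;
                y[i] += a[lda * j + i] * x[j]; i++;
            }
            for (; i < m; i++)               /* remainder rows      */
                y[i] += a[lda * j + i] * x[j];
        }
    }

The real kernels generate the ALPHA != 1 and strided-vector variants
of the same loop body through the spec_loop/norm_loop macros.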
--- kernel/mips64/KERNEL.LOONGSON3A | 10 +++ kernel/mips64/gemv_n_loongson3a.c | 98 ++++++++++++++++++++++++++++++ kernel/mips64/gemv_t_loongson3a.c | 93 ++++++++++++++++++++++++++++ kernel/mips64/zgemv_n_loongson3a.c | 92 ++++++++++++++++++++++++++++ kernel/mips64/zgemv_t_loongson3a.c | 91 +++++++++++++++++++++++++++ 5 files changed, 384 insertions(+) create mode 100644 kernel/mips64/gemv_n_loongson3a.c create mode 100644 kernel/mips64/gemv_t_loongson3a.c create mode 100644 kernel/mips64/zgemv_n_loongson3a.c create mode 100644 kernel/mips64/zgemv_t_loongson3a.c diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 91f2e7dd1..fc247e473 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -1,6 +1,16 @@ SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c + + SGEMMKERNEL = sgemm_kernel_8x4_ps.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c new file mode 100644 index 000000000..bb27379f5 --- /dev/null +++ b/kernel/mips64/gemv_n_loongson3a.c @@ -0,0 +1,98 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. + +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) +#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) +#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) +#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) +{ + + if(!ALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 4; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0; + + if(ALPHA == 1) { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[i + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M);) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[h + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M);) { + norm_loop_alpha1; + } + } + } + } else { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[i + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } 
+ for(; likely(i < M);) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(Y[h + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M);) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c new file mode 100644 index 000000000..5c6c8389e --- /dev/null +++ b/kernel/mips64/gemv_t_loongson3a.c @@ -0,0 +1,93 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. + +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) +#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) +#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) +#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!ALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 3; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0; + + if(ALPHA == 1) { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M);) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M);) { + norm_loop_alpha1; + } + } + } + } else { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[i + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M);) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCY) { + BLASLONG i = 0, h = 0; + for(; likely(i < tMQ);) { + prefetch(A[LDA * j + i + fahead]); + prefetch(X[h + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M);) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c new file mode 100644 index 000000000..f8275c371 --- /dev/null +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -0,0 +1,92 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. 
+ +//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) +//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) +#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) +#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!rALPHA && iALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 60; + BLASLONG spec_unroll = 2; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0, jj=0; + + + if(rALPHA == 1 && iALPHA == 0) { + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M); i++) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M); i++) { + norm_loop_alpha1; + } + } + } + } else { + FLOAT rTmp, iTmp; + if(INCY == 1) { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[ii + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M); i++) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(Y[iii + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M); i++) { + norm_loop; + } + } + } + } + return 0; +} diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c new file mode 100644 index 000000000..4b2c2b6b5 --- /dev/null +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -0,0 +1,91 @@ +#include "common.h" + +//These are auto-tuning codes on Loongson-3A platform. 
+//#define prefetch(x) __builtin_prefetch(x) +//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) +#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) +#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + + if(!rALPHA && iALPHA) + return 0; + + if(INCX < 0) + INCX = -INCX; + if(INCY < 0) + INCY = -INCY; + + BLASLONG fahead = 30; + BLASLONG spec_unroll = 2; + BLASLONG tMQ = M - M % spec_unroll; + BLASLONG j = 0, k = 0, jj=0; + + + if(rALPHA == 1 && iALPHA == 0) { + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[ii + fahead]); + /*loop_mark*/ spec_loop_alpha1; + /*loop_mark*/ spec_loop_alpha1; + } + for(; likely(i < M); i++) { + spec_loop_alpha1; + } + } + } else { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[iii + fahead]); + /*loop_mark*/ norm_loop_alpha1; + /*loop_mark*/ norm_loop_alpha1; + } + for(; likely(i < M); i++) { + norm_loop_alpha1; + } + } + } + } else { + FLOAT rTmp, iTmp; + if(INCX == 1) { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[ii + fahead]); + /*loop_mark*/ spec_loop; + /*loop_mark*/ spec_loop; + } + for(; likely(i < M); i++) { + spec_loop; + } + } + } else { + for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { + BLASLONG i = 0, ii = 0, iii = 0; + for(; likely(i < tMQ); i += spec_unroll) { + prefetch(A[jj + ii + fahead]); + prefetch(X[iii + fahead]); + /*loop_mark*/ norm_loop; + /*loop_mark*/ norm_loop; + } + for(; likely(i < M); i++) { + norm_loop; + } + } + } + } + return 0; +} From a32e56500ac4cfad0e60a6a4f7671bfee54195e6 Mon Sep 17 00:00:00 2001 From: traz Date: Fri, 4 Nov 2011 19:32:21 +0000 Subject: [PATCH 28/52] Fix the compute error of gemv when incx and incy are negative numbers. 
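
The old code simply flipped a negative increment positive, which walks
the right memory cells but pairs them with the wrong rows and columns:
the BLAS convention for a negative increment is to traverse the vector
backwards, starting at offset (1 - n) * inc, as the commented-out
kx/ky lines added below record.  A sketch of that rule (the helper
name is hypothetical, not part of the kernel):

    /* Start offset of a logical length-n vector stored with
       increment inc; element i then lives at x[off + i * inc]. */
    static long start_offset(long n, long inc)
    {
        return inc < 0 ? (1 - n) * inc : 0;   /* n=4, inc=-1 -> 3 */
    }

With inc = -1 and n = 4 this visits x[3], x[2], x[1], x[0], matching
the reference BLAS, whereas taking |inc| pairs x[0] with the first
column instead of the last.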
--- kernel/mips64/gemv_n_loongson3a.c | 23 +++++++++++++---------- kernel/mips64/gemv_t_loongson3a.c | 8 ++++---- kernel/mips64/zgemv_n_loongson3a.c | 8 ++++---- kernel/mips64/zgemv_t_loongson3a.c | 8 ++++---- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c index bb27379f5..7db595449 100644 --- a/kernel/mips64/gemv_n_loongson3a.c +++ b/kernel/mips64/gemv_n_loongson3a.c @@ -16,13 +16,16 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { + BLASLONG kx=0, ky=0; if(!ALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; + //if(INCX < 0) + // kx = (1-N) * INCX; + // INCX = -INCX; + //if(INCY < 0) + // ky = (1-M) * INCY; + // INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 4; @@ -31,7 +34,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO if(ALPHA == 1) { if(INCY == 1) { - for(; likely(j < N); j++, k += INCX) { + for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); @@ -46,8 +49,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } } else { - for(; likely(j < N); j++, k += INCX) { - BLASLONG i = 0, h = 0; + for(k=kx; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); @@ -63,7 +66,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } else { if(INCY == 1) { - for(; likely(j < N); j++, k += INCX) { + for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); @@ -78,8 +81,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO } } } else { - for(; likely(j < N); j++, k += INCX) { - BLASLONG i = 0, h = 0; + for(k=kx; likely(j < N); j++, k += INCX) { + BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c index 5c6c8389e..51f035d8e 100644 --- a/kernel/mips64/gemv_t_loongson3a.c +++ b/kernel/mips64/gemv_t_loongson3a.c @@ -18,10 +18,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLO if(!ALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 3; diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c index f8275c371..7b094de80 100644 --- a/kernel/mips64/zgemv_n_loongson3a.c +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -19,10 +19,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, F if(!rALPHA && iALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) - INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 60; BLASLONG spec_unroll = 2; diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c index 4b2c2b6b5..3835879ad 100644 --- a/kernel/mips64/zgemv_t_loongson3a.c +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -18,10 +18,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, F if(!rALPHA && iALPHA) return 0; - if(INCX < 0) - INCX = -INCX; - if(INCY < 0) 
- INCY = -INCY; +// if(INCX < 0) +// INCX = -INCX; +// if(INCY < 0) +// INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 2; From b95ad4cfafdadb2cf7d7baae70d42a83d246f50a Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 9 Nov 2011 19:28:22 +0000 Subject: [PATCH 29/52] Support detecting ICT Loongson-3B CPU. --- common_mips64.h | 12 +++++- cpuid_mips.c | 19 ++++++++- driver/others/parameter.c | 2 +- getarch.c | 15 ++++++++ kernel/mips64/KERNEL.LOONGSON3B | 68 +++++++++++++++++++++++++++++++++ param.h | 41 ++++++++++++++++++++ 6 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 kernel/mips64/KERNEL.LOONGSON3B diff --git a/common_mips64.h b/common_mips64.h index 35d8265bc..15f947eb8 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,10 +101,13 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) unsigned long long tmp; __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); ret=tmp; +#elif defined(LOONGSON3B) + //Temp Implementation. + return 1; #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -234,6 +237,11 @@ REALNAME: ;\ #define FIXED_PAGESIZE (16UL << 10) #endif +#if defined(LOONGSON3B) +#define PAGESIZE (16UL << 10) +#define FIXED_PAGESIZE (16UL << 10) +#endif + #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif @@ -245,7 +253,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips.c b/cpuid_mips.c index f50a4ec3e..217492dd7 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 +#define CPU_LOONGSON3B 3 static char *cpuname[] = { "UNKOWN", "SICORTEX", - "LOONGSON3A" + "LOONGSON3A", + "LOONGSON3B" }; int detect(void){ @@ -101,6 +103,8 @@ int detect(void){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ @@ -130,6 +134,8 @@ void get_architecture(void){ void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); + }else if(detect()==CPU_LOONGSON3B){ + printf("LOONGSON3B"); }else{ printf("SICORTEX"); } @@ -149,6 +155,15 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); + }else if(detect()==CPU_LOONGSON3B){ + printf("#define LOONGSON3B\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); @@ -164,6 +179,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); + }else if(detect()==CPU_LOONGSON3B) { + printf("loongson3b\n"); }else{ #ifdef __mips64 printf("mips64\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index fc7f0447e..3e660220e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -683,7 +683,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3A) || defined(LOONGSON3B) #ifdef SMP if(blas_num_threads == 1){ #endif diff --git a/getarch.c b/getarch.c index df052df8a..5b614472a 100644 --- a/getarch.c +++ b/getarch.c @@ -117,6 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ @@ -548,6 +549,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_LOONGSON3B +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "LOONGSON3B" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DLOONGSON3B " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "loongson3b" +#define CORENAME "LOONGSON3B" +#else +#endif + #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B new file mode 100644 index 000000000..fc247e473 --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3B @@ -0,0 +1,68 @@ +SAXPYKERNEL=axpy_loongson3a.S +DAXPYKERNEL=daxpy_loongson3a_simd.S + +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c + + +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/param.h b/param.h index 4ffe05cf8..39f0d996c 100644 --- a/param.h +++ b/param.h @@ -1513,6 +1513,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#ifdef LOONGSON3B +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 80 + +#define SGEMM_DEFAULT_R 1024 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_R 1024 + +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 + +#define SYMV_P 16 +#endif + #ifdef GENERIC #define SNUMOPT 2 From 2d78fb05c8a2fda923fec94aeb5eb16f1bf7671f Mon Sep 17 00:00:00 2001 From: traz Date: Thu, 10 Nov 2011 15:38:48 +0000 Subject: [PATCH 30/52] Add conjugate condition to gemv. --- kernel/mips64/zgemv_n_loongson3a.c | 79 ++++++++++++++++++++++++------ kernel/mips64/zgemv_t_loongson3a.c | 66 +++++++++++++++++++------ 2 files changed, 113 insertions(+), 32 deletions(-) diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c index 7b094de80..3b1b6f73b 100644 --- a/kernel/mips64/zgemv_n_loongson3a.c +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -1,34 +1,81 @@ -#include "common.h" +#include "common.h" -//These are auto-tuning codes on Loongson-3A platform. +//typedef int BLASLONG; +//typedef double FLOAT; -//#define prefetch(x) __builtin_prefetch(x) -//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) -//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) -#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) -#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) -#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 
+#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + +#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) + +#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; 
iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; -// if(INCX < 0) -// INCX = -INCX; -// if(INCY < 0) -// INCY = -INCY; - BLASLONG fahead = 60; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; - BLASLONG j = 0, k = 0, jj=0; - + BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCY == 1) { diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c index 3835879ad..3af44caf2 100644 --- a/kernel/mips64/zgemv_t_loongson3a.c +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -1,33 +1,67 @@ -#include "common.h" +#include "common.h" -//These are auto-tuning codes on Loongson-3A platform. -//#define prefetch(x) __builtin_prefetch(x) -//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) -//#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[ii + 1] += A[jj + ii + 1] * X[k] + A[jj + ii] * X[k + 1]; ii += 2;} while(0) -#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) -#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) -#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#if !defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_0 +#define spec_loop spec_loop_0 +#define norm_loop_alpha1 norm_loop_alpha1_0 +#define norm_loop norm_loop_0 +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_1 +#define spec_loop spec_loop_1 +#define norm_loop_alpha1 norm_loop_alpha1_1 +#define norm_loop norm_loop_1 +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_2 +#define spec_loop spec_loop_2 +#define norm_loop_alpha1 norm_loop_alpha1_2 +#define norm_loop norm_loop_2 +#endif + +#if defined(CONJ) && defined(XCONJ) +#define spec_loop_alpha1 spec_loop_alpha1_3 +#define spec_loop spec_loop_3 +#define norm_loop_alpha1 norm_loop_alpha1_3 +#define norm_loop norm_loop_3 +#endif + + +#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 
1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) +#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) + +#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) +#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) + +#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) + +#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) +#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; -// if(INCX < 0) -// INCX = -INCX; -// if(INCY < 0) -// INCY = -INCY; - BLASLONG fahead = 30; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; - BLASLONG j = 0, k = 0, jj=0; - + BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCX == 1) { From d1baf14a64d93062763f9899fa9c2d7e4bad62a3 Mon 
Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 11 Nov 2011 17:49:41 +0000 Subject: [PATCH 31/52] Enable thread affinity on Loongson 3B. Fixed the bug of reading cycle counter. In Loongson 3A and 3B, the CPU core increases the counter in every 2 cycles by default. --- Makefile.system | 2 ++ common_mips64.h | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Makefile.system b/Makefile.system index 84f41a78f..985f95084 100644 --- a/Makefile.system +++ b/Makefile.system @@ -591,9 +591,11 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) +ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif +endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY diff --git a/common_mips64.h b/common_mips64.h index 15f947eb8..5db96c4aa 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -101,13 +101,15 @@ static void INLINE blas_lock(volatile unsigned long *address){ static inline unsigned int rpcc(void){ unsigned long ret; -#if defined(LOONGSON3A) - unsigned long long tmp; - __asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); - ret=tmp; -#elif defined(LOONGSON3B) - //Temp Implementation. - return 1; +#if defined(LOONGSON3A) || defined(LOONGSON3B) + // unsigned long long tmp; + //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); + //ret=tmp; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $2\n" + ".set pop": "=r"(ret):: "memory"); + #else __asm__ __volatile__(".set push \n" ".set mips32r2\n" @@ -117,6 +119,18 @@ static inline unsigned int rpcc(void){ return ret; } +//#if defined(LOONGSON3A) || defined(LOONGSON3B) +static inline int WhereAmI(void){ + int ret=0; + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $0\n" + ".set pop": "=r"(ret):: "memory"); + return ret; + +} +//#endif + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } From 285e69e2d16c6c4d5addcc124801c1aed01b1e2d Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 17 Nov 2011 16:46:26 +0000 Subject: [PATCH 32/52] Disable using simple thread level3 to fix a bug on Loongson 3B. 
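
With USE_SIMPLE_THREADED_LEVEL3, each thread roughly runs the full
single-threaded GEMM on its own slice of C, and the Loongson
gemm_thread_n driver (patches 25/26) carves every thread's packing
buffers out of the one shared allocation: thread t packs A at
sa + t * GEMM_OFFSET_A1 and B five A-panel strides above that.
Something in that scheme breaks on Loongson 3B, so 3B now falls back
to the default threaded level3 path.  The layout being abandoned,
shown as raw offsets for illustration (the driver expresses this as
pointer arithmetic on sa):

    #include <stdio.h>

    #define GEMM_OFFSET_A1 0x10000UL    /* from param.h */

    int main(void)
    {
        for (unsigned long t = 0; t < 4; t++) {
            unsigned long a_off = GEMM_OFFSET_A1 * t;        /* packed A */
            unsigned long b_off = a_off + GEMM_OFFSET_A1 * 5; /* packed B */
            printf("thread %lu: A at +0x%lx, B at +0x%lx\n",
                   t, a_off, b_off);
        }
        return 0;
    }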
--- Makefile.system | 9 ++++++++- common_mips64.h | 9 +++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 985f95084..da3820fec 100644 --- a/Makefile.system +++ b/Makefile.system @@ -275,7 +275,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -march=mips64 +FCOMMON_OPT += -march=mips64 +endif + +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -529,8 +534,10 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) +ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif +endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 diff --git a/common_mips64.h b/common_mips64.h index 5db96c4aa..560f2c372 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -119,7 +119,8 @@ static inline unsigned int rpcc(void){ return ret; } -//#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3A) || defined(LOONGSON3B) +#define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" @@ -129,7 +130,7 @@ static inline int WhereAmI(void){ return ret; } -//#endif +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; @@ -252,8 +253,8 @@ REALNAME: ;\ #endif #if defined(LOONGSON3B) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) +#define PAGESIZE (32UL << 10) +#define FIXED_PAGESIZE (32UL << 10) #endif #ifndef PAGESIZE From ef6f7f32ae1ed01d65acce15d6c209ee5caee4c0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 23 Nov 2011 17:17:41 +0000 Subject: [PATCH 33/52] Fixed mbind bug on Loongson 3B. Check the return value of my_mbind function. --- common_linux.h | 4 ++++ driver/others/memory.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common_linux.h b/common_linux.h index 8b3d44bfa..45a688d23 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,9 +68,13 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { +#if defined (LOONGSON3B) + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/driver/others/memory.c b/driver/others/memory.c index ac9c87850..feb45eb58 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -389,12 +389,13 @@ static void *alloc_mmap(void *address){ if (map_address != (void *)-1) { #ifdef OS_LINUX -#ifdef DEBUG - int ret; +#if 1 + //#ifdef DEBUG + int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; - perror("alloc_mmap:"); + perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } From 8163ab7e55969d395a6ecd8881b2678e38e8b146 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Wed, 23 Nov 2011 18:40:35 +0000 Subject: [PATCH 34/52] Change the block size on Loongson 3B. 
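
The numbers are consistent with the 3B cache sizes declared in
cpuid_mips.c (64 KB L1D, 512 KB L2): with DGEMM_DEFAULT_Q = 92, the
packed B panel of Q x R doubles is 92 * 640 * 8 = 471,040 bytes at the
new single/dual-thread dgemm_r = 640, just under the L2, while the
multi-thread dgemm_r = 160 shrinks each thread's panel to about
115 KB.  A back-of-envelope check (constants copied from param.h and
parameter.c; this is bookkeeping for the commit message, not library
code):

    #include <stdio.h>

    int main(void)
    {
        const long q  = 92;              /* DGEMM_DEFAULT_Q             */
        const long r1 = 640, rn = 160;   /* dgemm_r: <=2 threads / more */
        const long l2 = 512 * 1024;

        printf("B panel, few threads : %ld of %ld bytes\n", q * r1 * 8, l2);
        printf("B panel, many threads: %ld of %ld bytes\n", q * rn * 8, l2);
        return 0;
    }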
--- driver/level3/gemm_thread_mn.c | 4 ++-- driver/level3/gemm_thread_variable.c | 4 ++-- driver/others/parameter.c | 17 ++++++++++++++++- param.h | 6 +++--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 321e88f0c..b81c6fa40 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -77,8 +77,8 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9d83e950a..9ffe17040 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -55,8 +55,8 @@ int CNAME(int mode, range_M[0] = 0; i = arg -> m; } else { - range_M[0] = range_M[0]; - i = range_M[1] - range_M[0]; + range_M[0] = range_m[0]; + i = range_m[1] - range_m[0]; } num_cpu_m = 0; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 3e660220e..21f56e889 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -683,7 +683,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3A) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -696,5 +696,20 @@ void blas_set_parameter(void){ } #endif #endif + +#if defined(LOONGSON3B) +#ifdef SMP + if(blas_num_threads == 1 || blas_num_threads == 2){ +#endif + //single thread + dgemm_r = 640; +#ifdef SMP + }else{ + //multi thread + dgemm_r = 160; + } +#endif +#endif + } #endif diff --git a/param.h b/param.h index 39f0d996c..610eb5fab 100644 --- a/param.h +++ b/param.h @@ -1502,10 +1502,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 1024 +#define SGEMM_DEFAULT_R 640 #define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R 1024 -#define ZGEMM_DEFAULT_R 1024 +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 From 66904fc4e8c43d05231487ab0e063417141be4f6 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Fri, 25 Nov 2011 11:20:25 +0000 Subject: [PATCH 35/52] BLAS3 used standard MIPS instructions without extensions on Loongson 3B. 
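
The 3A kernels rely on the Loongson extension's 128-bit quad
floating-point loads and stores, emitted as raw .word encodings
through the gsLQC1/gsSQC1 macros (presumably because the assembler
has no mnemonics for them).  This patch avoids those extension
instructions on 3B and rewrites the kernels around ordinary LD/ST
pairs; the gs* encoding macros are still defined at the top of the
new files, though the code now issues plain loads.  For reference,
what the gsLQC1 macro encodes by hand, as C (field layout copied from
the macro; the 16-byte offset scaling and the ft-takes-the-lower-half
reading are inferred from the 3A call sites):

    #include <stdint.h>

    /* Instruction word of a 128-bit load filling FPRs ft and fq from
       base + off * 16 (ft appears to receive the lower 8 bytes). */
    static uint32_t gslqc1_word(uint32_t base, uint32_t fq,
                                uint32_t ft, uint32_t off)
    {
        return (0x32u << 26) | (base << 21) | (ft << 16)
             | (0x1u << 15)  | (off  << 6)  | (0x1u << 5) | fq;
    }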
--- kernel/mips64/KERNEL.LOONGSON3B | 16 +- kernel/mips64/cgemm_kernel_loongson3b_2x2.S | 1468 +++++++++++ kernel/mips64/dgemm_kernel_loongson3b_4x4.S | 2579 +++++++++++++++++++ kernel/mips64/sgemm_kernel_loongson3b_4x4.S | 2579 +++++++++++++++++++ kernel/mips64/zgemm_kernel_loongson3b_2x2.S | 1468 +++++++++++ param.h | 4 +- 6 files changed, 8100 insertions(+), 14 deletions(-) create mode 100644 kernel/mips64/cgemm_kernel_loongson3b_2x2.S create mode 100644 kernel/mips64/dgemm_kernel_loongson3b_4x4.S create mode 100644 kernel/mips64/sgemm_kernel_loongson3b_4x4.S create mode 100644 kernel/mips64/zgemm_kernel_loongson3b_2x2.S diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B index fc247e473..b98f263c4 100644 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ b/kernel/mips64/KERNEL.LOONGSON3B @@ -11,33 +11,25 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c -SGEMMKERNEL = sgemm_kernel_8x4_ps.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMKERNEL = sgemm_kernel_loongson3b_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMKERNEL = dgemm_kernel_loongson3b_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S -CGEMMINCOPY = ../generic/zgemm_ncopy_4.c -CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMKERNEL = cgemm_kernel_loongson3b_2x2.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o -ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMKERNEL = zgemm_kernel_loongson3b_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o diff --git a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 
$f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) 
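+# NOTE: each complex entry of the 2x2 result tile is accumulated as
+# four partial sums.  For C(1,1) these are c11 += a*c (MADD1),
+# c12 += b*c (MADD2), c13 += a*d (MADD3) and c14 -= b*d (MADD4 = NMSUB
+# in the NN case); .L18 recombines them.  Rough C model (illustrative):
+#   re = c11 + c14;   /* sum(a*c) - sum(b*d) */
+#   im = c12 + c13;   /* sum(b*c) + sum(a*d) */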
+ + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: 
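+# NOTE: .L15 starts the K mod 4 tail: L = K & 3 (TEMP & 3 for TRMM) and
+# alpha is reloaded, so .L16 runs single k-steps before falling through
+# to the write-back in .L18.  Sketch of the split (illustrative C):
+#   for (l = K >> 2; l; l--) { /* .L12: four fused k-steps */ }
+#   for (l = K &  3; l; l--) { /* .L16: one k-step */ }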
+#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + 
daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + 
LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) 
&& !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 
1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S new file mode 100644 index 000000000..4a8c9b0e4 --- /dev/null +++ b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S @@ -0,0 +1,2579 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define 
F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) + ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD 
t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD 
t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part + LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + LD b2,2*SIZE(B) + + MOV t13,t11 
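+# NOTE: this block uses mr=2, nr=4, so only the eight accumulators
+# t11/t21 .. t14/t24 are live; they are cleared by copying the zeroed
+# t11.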
+ MOV t23,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu 
CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) 
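+# NOTE: second half of the kr=2 tail; A and B were already advanced in
+# .L36, so these offset-0 loads stage a0/b0..b3 for the optional kr=1
+# step in .L38.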
+ MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + 
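+# NOTE: pointers advance once per unrolled group of four k-steps:
+# A by mr*kr = 16 doubles, B by nr*kr = 8 doubles, after which the .L44
+# loads restart from offset 0.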
+ daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + +.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 + LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else 
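+# NOTE: TRMM offset entry: skip the first KK packed panels, i.e.
+# A += KK*mr*SIZE and B += KK*nr*SIZE with mr = nr = 2, hence both
+# shifts below are 1 + BASE_SHIFT.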
+ dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll 
K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t22,t11 + beqz K,.L65 + nop + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 
0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: # nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu 
B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 
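+## This mask (KCO in the GEMM path; the TRMM path below masks TEMP) picks
+## the K%4 remainder left over by the unrolled .L91 loop: bit 1 selects the
+## kr=2 step at .L96, bit 0 the final kr=1 step at .L98. A C sketch of the
+## same nr=mr=1 tail scheme (illustrative only, not part of the kernel;
+## t11 is the scalar accumulator):
+##
+##   long k = 0;
+##   for (; k + 4 <= KCO; k += 4)                 /* .L91: kr=4 body */
+##       t11 += A[k]*B[k] + A[k+1]*B[k+1]
+##            + A[k+2]*B[k+2] + A[k+3]*B[k+3];
+##   if (KCO & 2) { t11 += A[k]*B[k] + A[k+1]*B[k+1]; k += 2; }  /* .L96 */
+##   if (KCO & 1) { t11 += A[k]*B[k]; }                          /* .L98 */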
+#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S new file mode 100644 index 000000000..4a8c9b0e4 --- /dev/null +++ b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S @@ -0,0 +1,2579 @@ +#define REALNAME ASMNAME +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define KCO $18 +#define MCO $19 +#define NCO $20 + +#define SPANB $21 +#define PREB $23 +#define PREA $24 +#define SPANA $25 + +#define ALPHA $f15 + +#if defined(TRMMKERNEL) +#define OFFSET $2 +#define KK $3 +#define TEMP $7 +#endif + +#define R8 8 +#define R9 9 +#define R14 14 +#define R15 15 +#define R16 16 +#define R17 17 + +#define t11 $f30 +#define t21 $f31 +#define t31 $f28 +#define t41 $f29 + +#define t12 $f26 +#define t22 $f27 +#define t32 $f24 +#define t42 $f25 + +#define t13 $f22 +#define t23 $f23 +#define t33 $f20 +#define t43 $f21 + +#define t14 $f18 +#define t24 $f19 +#define t34 $f16 +#define t44 $f17 + +#define c11 $f0 +#define c21 $f1 +#define c31 $f2 +#define c41 $f3 + +#define c12 $f4 +#define c22 $f5 +#define c32 $f6 +#define c42 $f7 + +#define c13 $f8 +#define c23 $f9 +#define c33 $f10 +#define c43 $f11 + +#define c14 $f12 +#define c24 $f13 +#define c34 $f14 +#define c44 $f0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define b0 $f8 +#define b1 $f9 +#define b2 $f10 +#define b3 $f11 +#define b4 $f12 +#define b5 $f13 +#define b6 $f14 +#define b7 $f15 + +#define F31 31 +#define F30 30 +#define F29 29 +#define F28 28 +#define F27 27 +#define F26 26 +#define F25 25 +#define F24 24 +#define F23 23 +#define F22 22 +#define F21 21 +#define F20 20 +#define F19 19 +#define F18 18 +#define F17 17 +#define F16 16 +#define F15 15 +#define F14 14 +#define F13 13 +#define F12 12 +#define F11 11 +#define F10 10 +#define F9 9 +#define F8 8 +#define F7 7 +#define F6 6 +#define F5 5 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 +#define F0 0 + + PROLOGUE + + daddiu $sp, $sp, -160 + sd $16, 0($sp) + sd $17, 8($sp) + sd $18, 16($sp) + sd $19, 24($sp) + sd $20, 32($sp) + sd $21, 40($sp) + sd $22, 48($sp) 
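+## Fixed 160-byte frame: $16-$22 are saved at 0..48 above, $f24-$f28 at
+## 56..88 and $23-$25 at 96..112 below, $f20-$f23 at 120..144, and
+## 152($sp) is reserved for the ALPHA backup stored at .L0_N4. The
+## epilogue at .L999 restores from exactly these offsets before the
+## frame is released.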
+ ST $f24, 56($sp) + ST $f25, 64($sp) + ST $f26, 72($sp) + ST $f27, 80($sp) + ST $f28, 88($sp) + sd $23, 96($sp) + sd $24, 104($sp) + sd $25, 112($sp) + ST $f20,120($sp) + ST $f21,128($sp) + ST $f22,136($sp) + ST $f23,144($sp) + + + .align 5 +.L0_N4: # Loop N + ST ALPHA,152($sp) # Backup ALPHA + move MCO,M # Backup M + + move NCO,N # Backup N + move KCO,K # Backup K + + move AO,A # Backup A_addr + dsra N,NCO,2 # N=NCO/2 + + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte + dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 + +#if defined(TRMMKERNEL) + LDARG OFFSET,160($sp) # OFFSET is relate to the data part +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK,OFFSET +#endif + + move BO,B # Backup B_addr + beq N,$0,.L0_N2 # N=0,NCO<4 + dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte + +.L0_N4_Lb: # mr=4,nr=4 + move CO1,C + dsra M,MCO,2 # M=MCO/2 + + move A,AO # Reset A + daddu CO2,C,LDC + + daddu PREB,BO,SPANB # PreB point next panelB + daddu CO3,CO2,LDC + + daddu PREA,AO,SPANA + daddu CO4,CO3,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK,OFFSET +#endif + beqz M,.L14_M2 + daddu C,CO4,LDC # move C to next panel Cj + +.L10: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) +#else + dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K # move A B to data part + daddu B,BO,TEMP +#endif + + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK # temp is the length of the data part +#elif defined(LEFT) + daddiu TEMP, KK, 4 # S=L,U=L +#else + daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part +#endif + dsra K,TEMP,2 # K=KCO/2 + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 + +#else + move B,BO # Reset B + MTC $0,t11 # GEMM part NR=4,MR=4 + LD a0,0(A) + + MOV t21,t11 + MOV t31,t11 + LD a1,1*SIZE(A) + + MOV t41,t11 + MOV t12,t11 + LD b0,0(B) + + MOV t22,t11 + MOV t32,t11 + LD b1,1*SIZE(B) + + MOV t42,t11 + dsra K,KCO,2 # K=KCO/2 + LD a2,2*SIZE(A) + + MOV t13,t11 + MOV t23,t11 + LD b2,2*SIZE(B) + + MOV t33,t11 + MOV t43,t11 + LD a3,3*SIZE(A) + + MOV t14,t11 + MOV t24,t11 + LD b3,3*SIZE(B) + + MOV t34,t11 + beqz K,.L15 + MOV t44,t11 # clear 16 results registers +#endif + + .align 5 +.L11: # kr=4 + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + FETCH $0,(PREB) + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + LD b6,6*SIZE(B) + FETCH $0,(PREA) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + LD a7,7*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,7*SIZE(B) + +.L12: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,8*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,9*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,8*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,9*SIZE(B) + + 
FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,10*SIZE(A) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,10*SIZE(B) + + FETCH $0,4*SIZE(PREA) + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + LD a3,11*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + LD b3,11*SIZE(B) + +.L13: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,12*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,13*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,12*SIZE(B) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,13*SIZE(B) + + FETCH $0,8*SIZE(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,14*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,16*SIZE # 4mr*4kr + LD b6,14*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,16*SIZE # 4nr*4kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L14: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + daddiu K,K,-1 + LD b0,0(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,16*SIZE + LD b1,1*SIZE(B) + + FETCH $0,12*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREB,PREB,16*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + bnez K,.L11 + LD b3,3*SIZE(B) + + +.L15: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP, 2 +#endif + beqz K,.L18 + nop + +.L16: + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + LD a4,4*SIZE(A) + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + LD a5,5*SIZE(A) + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + LD b4,4*SIZE(B) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + LD b5,5*SIZE(B) + + FETCH $0,0(PREB) + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + LD a6,6*SIZE(A) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,8*SIZE # 4mr*2kr + LD b6,6*SIZE(B) + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + daddu B,B,8*SIZE # 4nr*2kr + LD a7,-1*SIZE(A) + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + LD b7,-1*SIZE(B) + +.L17: + MADD t11,t11,a4,b4 + MADD t21,t21,a5,b4 + LD a0,0*SIZE(A) + + MADD t12,t12,a4,b5 + MADD t22,t22,a5,b5 + LD a1,1*SIZE(A) + + MADD t31,t31,a6,b4 + MADD t41,t41,a7,b4 + LD b0,0*SIZE(B) + + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + LD b1,1*SIZE(B) + + FETCH $0,4*SIZE(PREB) + MADD t13,t13,a4,b6 + MADD t23,t23,a5,b6 + LD a2,2*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + LD b2,2*SIZE(B) + + MADD t33,t33,a6,b6 + MADD t43,t43,a7,b6 + daddu PREA,PREA,8*SIZE + LD a3,3*SIZE(A) + + MADD t34,t34,a6,b7 + MADD t44,t44,a7,b7 + daddu PREB,PREB,8*SIZE + LD b3,3*SIZE(B) + + +.L18: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L19 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREB) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # 4mr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,4*SIZE # 4nr*kr + + FETCH $0,0(PREA) + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu PREB,PREB,4*SIZE + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + daddu PREA,PREA,4*SIZE + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t33,t33,a2,b2 + MADD t43,t43,a3,b2 + + MADD t34,t34,a2,b3 + MADD t44,t44,a3,b3 + +.L19: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part 
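+## GEMM write-back: this load and the fifteen below fetch the 4x4 tile of
+## C, and each accumulator is folded in with a fused multiply-add,
+## c = t*alpha + c (MADD t,c,t,ALPHA); the TRMM branch further down stores
+## t*alpha without reading C. A C sketch of the update (illustrative only;
+## CO[j] stands for the column pointers CO1..CO4, t[i][j] for t11..t44):
+##
+##   for (int j = 0; j < 4; j++)
+##       for (int i = 0; i < 4; i++)
+##           CO[j][i] = alpha * t[i][j] + CO[j][i];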
+ LD c21,1*SIZE(CO1) # get 16 C + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + LD c13,0(CO3) + MADD t12,c12,t12,ALPHA + LD c23,1*SIZE(CO3) + MADD t22,c22,t22,ALPHA + LD c33,2*SIZE(CO3) + MADD t32,c32,t32,ALPHA + LD c43,3*SIZE(CO3) + MADD t42,c42,t42,ALPHA + + LD c14,0(CO4) + MADD t13,c13,t13,ALPHA + LD c24,1*SIZE(CO4) + MADD t23,c23,t23,ALPHA + LD c34,2*SIZE(CO4) + MADD t33,c33,t33,ALPHA + LD c44,3*SIZE(CO4) + MADD t43,c43,t43,ALPHA + + ST t11,0(CO1) + MADD t14,c14,t14,ALPHA + ST t21,1*SIZE(CO1) + MADD t24,c24,t24,ALPHA + ST t31,2*SIZE(CO1) + MADD t34,c34,t34,ALPHA + ST t41,3*SIZE(CO1) + MADD t44,c44,t44,ALPHA + daddiu M,M,-1 # M-- + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + ST t13,0(CO3) + ST t23,1*SIZE(CO3) + ST t33,2*SIZE(CO3) + ST t43,3*SIZE(CO3) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + FETCH $0,8*SIZE(CO3) + FETCH $0,8*SIZE(CO4) + + ST t14,0(CO4) + daddu CO1,CO1,4*SIZE # COi += 4 + ST t24,1*SIZE(CO4) + daddu CO2,CO2,4*SIZE + ST t34,2*SIZE(CO4) + daddu CO3,CO3,4*SIZE + ST t44,3*SIZE(CO4) + daddu PREB,BO,SPANB + + bnez M,.L10 + daddu CO4,CO4,4*SIZE + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t31, 2 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t41, 3 * SIZE(CO1) + MUL t42, ALPHA, t42 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + ST t32, 2 * SIZE(CO2) + MUL t33, ALPHA, t33 + ST t42, 3 * SIZE(CO2) + MUL t43, ALPHA, t43 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + ST t33, 2 * SIZE(CO3) + MUL t34, ALPHA, t34 + ST t43, 3 * SIZE(CO3) + MUL t44, ALPHA, t44 + + ST t14, 0 * SIZE(CO4) + daddiu M,M,-1 # M-- + ST t24, 1 * SIZE(CO4) + ST t34, 2 * SIZE(CO4) + ST t44, 3 * SIZE(CO4) + daddiu CO1,CO1, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE + daddiu CO3,CO3, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,4*SIZE(CO3) + FETCH $0,4*SIZE(CO4) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP, -4 +#else + daddiu TEMP,TEMP, -4 +#endif + dsll K,TEMP,2 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + daddu A,A,K # mov A to the end of panel Ai + daddu B,B,TEMP # mov B to the end of panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK,4 +#endif + bnez M,.L10 + nop +#endif + + + .align 3 +.L14_M2: + andi M, MCO, 2 # nr=4,mr=2 + beqz M,.L14_M1 + nop + +.L20: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll TEMP,KK,2 + BASE_SHIFT # nr=4 + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP,KK,2 # left 
part,controlled by mr, mr=2 +#else + daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 +#endif + dsra K,TEMP,2 + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 # clear 2*4=8 results registers + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t22,t11 + dsra K,KCO,2 + LD b2,2*SIZE(B) + + MOV t13,t11 + MOV t23,t11 + LD b3,3*SIZE(B) + + MOV t14,t11 + beqz K,.L25 + MOV t24,t11 + +#endif + +.L21: # nr=4,mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD a3,5*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,8*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,9*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,10*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,11*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + daddiu K,K,-1 + + MADD t11,t11,a2,b0 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b0 + LD a7,7*SIZE(A) + + MADD t12,t12,a2,b1 + LD b4,12*SIZE(B) + MADD t22,t22,a3,b1 + LD b5,13*SIZE(B) + + MADD t13,t13,a2,b2 + LD b6,14*SIZE(B) + MADD t23,t23,a3,b2 + LD b7,15*SIZE(B) + + MADD t14,t14,a2,b3 + MADD t24,t24,a3,b3 + daddu A,A,8*SIZE # 2mr*4kr + daddu B,B,16*SIZE # 4nr*4kr + + MADD t11,t11,a6,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a6,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a7,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a6,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a7,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a6,b7 + bnez K,.L21 + MADD t24,t24,a7,b7 + + +.L25: +#ifndef TRMMKERNEL + andi K,KCO,2 # kr=2 +#else + andi K,TEMP,2 +#endif + beqz K,.L28 + nop + +.L26: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,3*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,4*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,5*SIZE(B) + + MADD t13,t13,a0,b2 + LD b6,6*SIZE(B) + MADD t23,t23,a1,b2 + LD b7,7*SIZE(B) + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + daddu A,A,4*SIZE # 2mr*2kr + daddu B,B,8*SIZE # 4nr*2kr + +.L27: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t13,t13,a4,b6 + LD b2,2*SIZE(B) + MADD t23,t23,a5,b6 + LD b3,3*SIZE(B) + + MADD t14,t14,a4,b7 + MADD t24,t24,a5,b7 + + +.L28: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L29 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # 2mr*kr + daddu B,B,4*SIZE # 4nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + MADD t13,t13,a0,b2 + MADD t23,t23,a1,b2 + + MADD t14,t14,a0,b3 + MADD t24,t24,a1,b3 + +.L29: # Write Back to C +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write back part + LD c21,1*SIZE(CO1) + + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + LD c13,0(CO3) + MADD t11,c11,t11,ALPHA + LD c23,1*SIZE(CO3) + MADD t21,c21,t21,ALPHA + + LD c14,0(CO4) + MADD t12,c12,t12,ALPHA + LD c24,1*SIZE(CO4) + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + MADD t13,c13,t13,ALPHA + ST t21,1*SIZE(CO1) + MADD t23,c23,t23,ALPHA + + ST t12,0(CO2) + MADD t14,c14,t14,ALPHA + ST t22,1*SIZE(CO2) + MADD t24,c24,t24,ALPHA + + ST t13,0(CO3) + daddu CO1,CO1,2*SIZE # COi += 2 + ST t23,1*SIZE(CO3) + daddu CO2,CO2,2*SIZE + + ST t14,0(CO4) + daddu CO3,CO3,2*SIZE + ST t24,1*SIZE(CO4) + daddu CO4,CO4,2*SIZE + + FETCH $0,0(CO1) + 
FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#else + MUL t11, ALPHA, t11 # TRMM write back part + MUL t21, ALPHA, t21 + + ST t11, 0 * SIZE(CO1) + MUL t12, ALPHA, t12 + ST t21, 1 * SIZE(CO1) + MUL t22, ALPHA, t22 + + ST t12, 0 * SIZE(CO2) + MUL t13, ALPHA, t13 + ST t22, 1 * SIZE(CO2) + MUL t23, ALPHA, t23 + + ST t13, 0 * SIZE(CO3) + MUL t14, ALPHA, t14 + ST t23, 1 * SIZE(CO3) + MUL t24, ALPHA, t24 + + ST t14, 0 * SIZE(CO4) + ST t24, 1 * SIZE(CO4) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,0(CO3) + FETCH $0,0(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP,KCO,KK +#ifdef LEFT + daddiu TEMP,TEMP,-2 +#else + daddiu TEMP,TEMP,-4 +#endif + dsll K,TEMP,1 + BASE_SHIFT + dsll TEMP,TEMP,2 + BASE_SHIFT + + daddu A,A,K # move A to next panel Ai + daddu B,B,TEMP # move B to next panel Bj +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L14_M1: + andi M,MCO,1 # mr=1 + beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj + nop + +.L30: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, BASE_SHIFT + dsll TEMP,KK,2 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + LD b3,3*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra K,TEMP, 2 + nop + beqz K,.L35 + nop + +#else + move B,BO # Reset B, GEMM part + dsra K,KCO,2 # K=KCO/2 + LD a0, 0 * SIZE(A) # a0 + + MTC $0,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + + MOV t13,t11 + LD b2,2*SIZE(B) + + MOV t14,t11 + beqz K,.L35 + LD b3,3*SIZE(B) + +#endif + +.L31: # nr=4,mr=1,kr=4 + LD a1, 1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + + LD b6,6*SIZE(B) + LD b7,7*SIZE(B) + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + + LD a2, 2*SIZE(A) # a2 + MADD t11,t11,a1,b4 + + LD b0,8*SIZE(B) + LD b1,9*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,10*SIZE(B) + LD b3,11*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + LD a3, 3*SIZE(A) # a3 + MADD t11,t11,a2,b0 + daddiu K,K,-1 + + LD b4,12*SIZE(B) + LD b5,13*SIZE(B) + MADD t12,t12,a2,b1 + daddu A,A,4*SIZE # 1mr*4kr + + LD b6,14*SIZE(B) + LD b7,15*SIZE(B) + MADD t13,t13,a2,b2 + MADD t14,t14,a2,b3 + + LD a0, 0*SIZE(A) # a0 + daddu B,B,16*SIZE # 4nr*4kr + MADD t11,t11,a3,b4 + + LD b0,0*SIZE(B) + MADD t12,t12,a3,b5 + LD b1,1*SIZE(B) + MADD t13,t13,a3,b6 + + LD b2,2*SIZE(B) + MADD t14,t14,a3,b7 + bnez K,.L31 + LD b3,3*SIZE(B) + + +.L35: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L38 + nop + +.L36: + LD a1,1*SIZE(A) # load a1 + MADD t11,t11,a0,b0 + + LD b4,4*SIZE(B) + LD b5,5*SIZE(B) + MADD t12,t12,a0,b1 + daddu A,A,2*SIZE # mr*2kr + + LD b6,6*SIZE(B) + MADD t13,t13,a0,b2 + + LD b7,7*SIZE(B) + MADD t14,t14,a0,b3 + daddu B,B,8*SIZE # 4nr*2kr + + +.L37: + LD a0,0(A) + MADD t11,t11,a1,b4 + + LD b0,0*SIZE(B) + LD b1,1*SIZE(B) + MADD t12,t12,a1,b5 + + LD b2,2*SIZE(B) + LD b3,3*SIZE(B) + MADD t13,t13,a1,b6 + MADD t14,t14,a1,b7 + + +.L38: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L39 + LD ALPHA,152($sp) # Get ALPHA + + 
MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE + daddu B,B,4*SIZE + + MADD t13,t13,a0,b2 + MADD t14,t14,a0,b3 + +.L39: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) + LD c12,0(CO2) + LD c13,0(CO3) + LD c14,0(CO4) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + MADD t13,c13,t13,ALPHA + MADD t14,c14,t14,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + ST t13,0(CO3) + ST t14,0(CO4) +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + MUL t13, ALPHA, t13 + MUL t14, ALPHA, t14 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + ST t13, 0 * SIZE(CO3) + ST t14, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll K,TEMP, BASE_SHIFT + dsll TEMP,TEMP, 2 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + + .align 3 +.L0_N4_Loop: # mc finished + daddiu N,N,-1 # N-- +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK,4 +#endif + bnez N,.L0_N4_Lb + move BO,B # Set BO point to next panel Bj + + .align 5 +.L0_N2: + andi N,NCO,2 # nr = 2 + beqz N,.L0_N1 + nop + +.L0_N2_Lb: + move CO1,C + daddu CO2,C,LDC + + dsra M,MCO,2 + move A,AO # Reset A + + daddu PREA,AO,SPANA + daddu C,CO2,LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + beqz M,.L12_M2 + nop + +.L40: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K,KK, 2 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT + + daddu A,A,K + daddu B,BO,TEMP +#endif + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP,KCO,KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#else + move B,BO # Reset B + LD a0,0*SIZE(A) + MTC $0,t11 # gemm part + LD a1,1*SIZE(A) + + MOV t21,t11 + LD b0,0*SIZE(B) + MOV t31,t11 + LD b1,1*SIZE(B) + + MOV t41,t11 + LD a2,2*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a3,3*SIZE(A) + + MOV t12,t11 + MOV t22,t11 + + MOV t32,t11 + beqz K,.L45 + MOV t42,t11 + +#endif + +.L41: # nr=2,mr=kr=4 + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,(PREA) + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L42: + MADD t11,t11,a4,b4 + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,9*SIZE(A) + + MADD t12,t12,a4,b5 + LD b2,4*SIZE(B) + MADD t22,t22,a5,b5 + LD b3,5*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,10*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,11*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + +.L43: + MADD t11,t11,a0,b2 + LD a4,12*SIZE(A) + MADD t21,t21,a1,b2 + LD a5,13*SIZE(A) + + MADD t12,t12,a0,b3 + LD b6,6*SIZE(B) + MADD t22,t22,a1,b3 + LD b7,7*SIZE(B) + + MADD t31,t31,a2,b2 + LD a6,14*SIZE(A) + MADD t41,t41,a3,b2 + LD a7,15*SIZE(A) + + FETCH $0,8*SIZE(PREA) + MADD t32,t32,a2,b3 + MADD t42,t42,a3,b3 + + daddu A,A,16*SIZE # 4mr*4kr + daddu B,B,8*SIZE # 2nr*4kr + +.L44: + MADD t11,t11,a4,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b6 + LD a1,1*SIZE(A) + + + MADD t12,t12,a4,b7 + LD 
b0,0*SIZE(B) + MADD t22,t22,a5,b7 + LD b1,1*SIZE(B) + + daddiu K,K,-1 + daddu PREA,PREA,16*SIZE + + MADD t31,t31,a6,b6 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b6 + LD a3,3*SIZE(A) + + FETCH $0,-4*SIZE(PREA) + MADD t32,t32,a6,b7 + bnez K,.L41 + MADD t42,t42,a7,b7 + + +.L45: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L48 + nop + +.L46: + MADD t11,t11,a0,b0 + LD a4,4*SIZE(A) + MADD t21,t21,a1,b0 + LD a5,5*SIZE(A) + + MADD t12,t12,a0,b1 + LD b4,2*SIZE(B) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t31,t31,a2,b0 + LD a6,6*SIZE(A) + MADD t41,t41,a3,b0 + LD a7,7*SIZE(A) + + FETCH $0,0(PREA) + MADD t32,t32,a2,b1 + daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 + + MADD t42,t42,a3,b1 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L47: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD a1,1*SIZE(A) + + MADD t12,t12,a4,b5 + LD b0,0*SIZE(B) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + + MADD t31,t31,a6,b4 + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + LD a3,3*SIZE(A) + + FETCH $0,4*SIZE(PREA) + MADD t32,t32,a6,b5 + MADD t42,t42,a7,b5 + daddu PREA,PREA,8*SIZE + + + +.L48: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L49 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + daddu B,B,2*SIZE + daddu PREA,PREA,4*SIZE + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + + MADD t32,t32,a2,b1 + MADD t42,t42,a3,b1 + +.L49: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # gemm write back part Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + LD c12,0(CO2) + MADD t11,c11,t11,ALPHA + LD c22,1*SIZE(CO2) + MADD t21,c21,t21,ALPHA + LD c32,2*SIZE(CO2) + MADD t31,c31,t31,ALPHA + LD c42,3*SIZE(CO2) + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + MADD t12,c12,t12,ALPHA + ST t21,1*SIZE(CO1) + MADD t22,c22,t22,ALPHA + ST t31,2*SIZE(CO1) + MADD t32,c32,t32,ALPHA + ST t41,3*SIZE(CO1) + MADD t42,c42,t42,ALPHA + daddiu M,M,-1 + + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + ST t32,2*SIZE(CO2) + ST t42,3*SIZE(CO2) + + FETCH $0,4*SIZE(CO1) + FETCH $0,4*SIZE(CO2) + FETCH $0,8*SIZE(CO1) + FETCH $0,8*SIZE(CO2) + + daddu CO1,CO1,4*SIZE + bnez M,.L40 + daddu CO2,CO2,4*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + MUL t12, ALPHA, t12 + ST t11, 0 * SIZE(CO1) + MUL t22, ALPHA, t22 + ST t21, 1 * SIZE(CO1) + MUL t32, ALPHA, t32 + ST t31, 2 * SIZE(CO1) + MUL t42, ALPHA, t42 + ST t41, 3 * SIZE(CO1) + + ST t12, 0 * SIZE(CO2) + daddiu M,M,-1 + ST t22, 1 * SIZE(CO2) + ST t32, 2 * SIZE(CO2) + ST t42, 3 * SIZE(CO2) + + daddiu CO1,CO1, 4*SIZE + daddiu CO2,CO2, 4*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + FETCH $0,4(CO1) + FETCH $0,4(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll K,TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A,A,K + daddu B,B,TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L40 + nop +#endif + + + .align 3 +.L12_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L12_M1 + nop + +.L50: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO +#else + dsll K, KK, 1 + BASE_SHIFT #mr=2 + dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + + MTC $0,t11 + LD 
b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#else + move B,BO + LD a0,0*SIZE(A) + dsra K,KCO,2 # K=KCO/2 + LD a1,1*SIZE(A) + + MTC $0,t11 + LD b0,0*SIZE(B) + MOV t21,t11 + LD b1,1*SIZE(B) + + MOV t12,t11 + beqz K,.L55 + MOV t22,t11 + +#endif + +.L51: # nr=2 mr=2,kr=4 + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + LD a5,3*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,3*SIZE(B) + + MADD t11,t11,a4,b4 + LD a2,4*SIZE(A) + MADD t21,t21,a5,b4 + LD b2,4*SIZE(B) + + MADD t12,t12,a4,b5 + LD a3,5*SIZE(A) + MADD t22,t22,a5,b5 + daddiu K,K,-1 + LD b3,5*SIZE(B) + + MADD t11,t11,a2,b2 + LD a6,6*SIZE(A) + MADD t21,t21,a3,b2 + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + LD b6,6*SIZE(B) + + MADD t12,t12,a2,b3 + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE + LD a7,-1*SIZE(A) + MADD t22,t22,a3,b3 + LD b7,-1*SIZE(B) + + MADD t11,t11,a6,b6 + LD a0,0*SIZE(A) + MADD t21,t21,a7,b6 + LD b0,0*SIZE(B) + + MADD t12,t12,a6,b7 + LD a1,1*SIZE(A) + + MADD t22,t22,a7,b7 + bnez K,.L51 + LD b1,1*SIZE(B) + + +.L55: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L58 + nop + +.L56: + MADD t11,t11,a0,b0 + LD a4,2*SIZE(A) + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + LD b4,2*SIZE(B) + + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE # 2nr*2kr + LD a5,-1*SIZE(A) + MADD t22,t22,a1,b1 + LD b5,-1*SIZE(B) + +.L57: + MADD t11,t11,a4,b4 + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + LD b0,0*SIZE(B) + + MADD t12,t12,a4,b5 + LD a1,1*SIZE(A) + MADD t22,t22,a5,b5 + LD b1,1*SIZE(B) + +.L58: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP, 1 +#endif + beqz K,.L59 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE # 2nr*kr + + MADD t12,t12,a0,b1 + MADD t22,t22,a1,b1 + + +.L59: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # write gemm part back Fetch 16 C + LD c21,1*SIZE(CO1) + LD c12,0(CO2) + LD c22,1*SIZE(CO2) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t12,c12,t12,ALPHA + MADD t22,c22,t22,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t12,0(CO2) + ST t22,1*SIZE(CO2) + + daddu CO1,CO1,2*SIZE + daddu CO2,CO2,2*SIZE + + FETCH $0,0(CO1) + FETCH $0,0(CO2) +#else + daddiu M, M, -1 + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t12, ALPHA, t12 + MUL t22, ALPHA, t22 + + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) + ST t12, -2 * SIZE(CO2) + ST t22, -1 * SIZE(CO2) + + FETCH $0,0(CO1) + FETCH $0,0(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L12_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L0_N2_Loop + nop + +.L60: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B,BO # Reset B +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV 
t12,t11 + LD b1,1*SIZE(B) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra K,TEMP,2 + MOV t22,t11 + beqz K,.L65 + nop + +#else + dsra K,KCO,2 + move B,BO # Reset B + LD a0,0*SIZE(A) + + MTC $0,t11 + MOV t21,t11 + LD b0,0*SIZE(B) + + MOV t12,t11 + LD b1,1*SIZE(B) + beqz K,.L65 + MOV t22,t11 + +#endif + +.L61: # nr=2,mr=1,kr=4 + LD a4, 1*SIZE(A) # a2 + LD b4, 2*SIZE(B) + MADD t11,t11,a0,b0 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + + LD a2, 2*SIZE(A) # a3 + LD b2,4*SIZE(B) + MADD t11,t11,a4,b4 + + LD b3,5*SIZE(B) + MADD t12,t12,a4,b5 + + LD a6, 3*SIZE(A) # a4 + daddiu K,K,-1 + LD b6,6*SIZE(B) + MADD t11,t11,a2,b2 + + LD b7,7*SIZE(B) + MADD t12,t12,a2,b3 + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE + + LD b0,0*SIZE(B) + MADD t11,t11,a6,b6 + + LD b1,1*SIZE(B) + bnez K,.L61 + MADD t12,t12,a6,b7 + + + +.L65: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L68 + nop + +.L66: + LD a4, 1*SIZE(A) # a1 + MADD t11,t11,a0,b0 + LD b4,2*SIZE(B) + daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 + + LD b5,3*SIZE(B) + MADD t12,t12,a0,b1 + daddu B,B,4*SIZE + +.L67: + LD a0,0(A) # a0 + LD b0,0*SIZE(B) + MADD t11,t11,a4,b4 + + LD b1,1*SIZE(B) + MADD t12,t12,a4,b5 + + +.L68: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L69 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t12,t12,a0,b1 + daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 + daddu B,B,2*SIZE + + +.L69: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c12,0(CO2) + + MADD t11,c11,t11,ALPHA + MADD t12,c12,t12,ALPHA + + ST t11,0(CO1) + ST t12,0(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#else + MUL t11, ALPHA, t11 + MUL t12, ALPHA, t12 + + ST t11, 0 * SIZE(CO1) + ST t12, 0 * SIZE(CO2) + + daddu CO1,CO1,1*SIZE + daddu CO2,CO2,1*SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll K, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + +.L0_N2_Loop: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move BO, B + + + .align 5 +.L0_N1: + andi N,NCO,1 # nr = 1 + beqz N,.L999 + nop + + move CO1,C + dsra M,MCO,2 + + move A,AO # Reset A + daddu PREA,AO,SPANA +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + beqz M,.L11_M2 + daddu C,CO1,LDC + +.L70: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO # Reset B +#else + dsll K, KK, 2 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + LD a3,3*SIZE(A) + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 4 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 + beqz K,.L75 + nop +#else + move B, BO # Reset B + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + LD a0,0*SIZE(A) + MOV t21,t11 + LD a1,1*SIZE(A) + + MOV t31,t11 + LD a2,2*SIZE(A) + MOV t41,t11 + beqz K,.L75 + LD a3,3*SIZE(A) + +#endif + +.L71: 
# nr=1,mr=kr=4 + LD b4, 1*SIZE(B) # b1 + MADD t11,t11,a0,b0 + + LD a4, 4*SIZE(A) + MADD t21,t21,a1,b0 + + LD a5, 5*SIZE(A) + FETCH $0,(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + +.L72: + LD b2, 2*SIZE(B) # b2 + MADD t11,t11,a4,b4 + + LD a0,8*SIZE(A) + MADD t21,t21,a5,b4 + + LD a1,9*SIZE(A) + FETCH $0,4*SIZE(PREA) + + LD a2,10*SIZE(A) + MADD t31,t31,a6,b4 + + LD a3,11*SIZE(A) + MADD t41,t41,a7,b4 + +.L73: + LD b6, 3*SIZE(B) + MADD t11,t11,a0,b2 + + LD a4,12*SIZE(A) + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a5,13*SIZE(A) + MADD t21,t21,a1,b2 + + LD a6,14*SIZE(A) + FETCH $0,8*SIZE(PREA) + MADD t31,t31,a2,b2 + + LD a7,15*SIZE(A) + MADD t41,t41,a3,b2 + daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE + +.L74: + LD b0, 0*SIZE(B) + MADD t11,t11,a4,b6 + + LD a0,0*SIZE(A) + daddu PREA,PREA,16*SIZE + + LD a1,1*SIZE(A) + MADD t21,t21,a5,b6 + + LD a2,2*SIZE(A) + daddiu K,K,-1 + MADD t31,t31,a6,b6 + + LD a3,3*SIZE(A) + MADD t41,t41,a7,b6 + bnez K,.L71 + FETCH $0,-32(PREA) + + +.L75: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L78 + nop + +.L76: + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a4,4*SIZE(A) + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 + + LD a5,5*SIZE(A) + MADD t21,t21,a1,b0 + FETCH $0,0(PREA) + + LD a6,6*SIZE(A) + MADD t31,t31,a2,b0 + + LD a7,7*SIZE(A) + MADD t41,t41,a3,b0 + daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE + +.L77: + LD b0,0(B) + MADD t11,t11,a4,b4 + + LD a0,0*SIZE(A) + MADD t21,t21,a5,b4 + FETCH $0,4*SIZE(PREA) + + LD a1,1*SIZE(A) + MADD t31,t31,a6,b4 + + LD a2,2*SIZE(A) + MADD t41,t41,a7,b4 + + LD a3,3*SIZE(A) + daddu PREA,PREA,8*SIZE + + + +.L78: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L79 + LD ALPHA,152($sp) # Get ALPHA + + FETCH $0,0(PREA) + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 + + MADD t31,t31,a2,b0 + MADD t41,t41,a3,b0 + daddu B,B,1*SIZE + daddu PREA,PREA,4*SIZE + + +.L79: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + LD c31,2*SIZE(CO1) + LD c41,3*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + MADD t31,c31,t31,ALPHA + MADD t41,c41,t41,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + daddiu M,M,-1 # M-- + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + bnez M,.L70 # M!=0 + daddu CO1,CO1,4*SIZE # COx += 4*8Byte +#else + daddiu M,M,-1 # M-- + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + MUL t31, ALPHA, t31 + MUL t41, ALPHA, t41 + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + ST t31,2*SIZE(CO1) + ST t41,3*SIZE(CO1) + + FETCH $0,4*SIZE(CO1) + FETCH $0,8*SIZE(CO1) + + daddu CO1,CO1,4*SIZE +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -4 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 2 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A,K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 4 +#endif + bnez M,.L70 + nop +#endif + + + .align 3 +.L11_M2: + andi M,MCO,2 # mr = 2 + beqz M,.L11_M1 + nop + +.L80: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + LD a1,1*SIZE(A) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra K,TEMP,2 # K=KCO/2 + beqz K,.L85 + nop +#else + move B, BO + dsra K,KCO,2 + LD b0, 0*SIZE(B) + + MTC $0,t11 + MOV t21,t11 + LD a0,0*SIZE(A) + + beqz K,.L85 + LD a1,1*SIZE(A) + +#endif + +.L81: # nr=1,mr=2,kr=4 + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + LD b2, 2*SIZE(B) + LD a2,4*SIZE(A) + MADD t11,t11,a4,b4 + LD a3,5*SIZE(A) + MADD t21,t21,a5,b4 + + LD b6, 3*SIZE(B) + LD a6,6*SIZE(A) + MADD t11,t11,a2,b2 + LD a7,7*SIZE(A) + MADD t21,t21,a3,b2 + + daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD b0, 0*SIZE(B) + daddiu K,K,-1 + + LD a0,0*SIZE(A) + MADD t11,t11,a6,b6 + + LD a1,1*SIZE(A) + bnez K,.L81 + MADD t21,t21,a7,b6 + +.L85: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L88 + nop + +.L86: + LD b4, 1*SIZE(B) + LD a4,2*SIZE(A) + MADD t11,t11,a0,b0 + LD a5,3*SIZE(A) + MADD t21,t21,a1,b0 + + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + + LD b0,0(B) + LD a0,0*SIZE(A) + MADD t11,t11,a4,b4 + LD a1,1*SIZE(A) + MADD t21,t21,a5,b4 + + + +.L88: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L89 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + MADD t21,t21,a1,b0 + daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 + daddu B,B,1*SIZE + + +.L89: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + LD c21,1*SIZE(CO1) + + MADD t11,c11,t11,ALPHA + MADD t21,c21,t21,ALPHA + + ST t11,0(CO1) + ST t21,1*SIZE(CO1) + + FETCH $0,2*SIZE(CO1) + + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + +#else + daddu CO1,CO1,2*SIZE # COx += 2*8Byte + MUL t11, ALPHA, t11 + MUL t21, ALPHA, t21 + + FETCH $0,0(CO1) + ST t11, -2 * SIZE(CO1) + ST t21, -1 * SIZE(CO1) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, KCO, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll K, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, B, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + + .align 3 +.L11_M1: + andi M,MCO,1 # mr = 1 + beqz M,.L999 + nop + +.L90: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B, BO +#else + dsll K, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu A, A, K + daddu B, BO, TEMP +#endif + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MTC $0,t11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, KCO, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra K, TEMP, 2 + beqz K,.L95 + nop + +#else + move B, BO + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + dsra K,KCO,2 + beqz K,.L95 + MTC $0,t11 +#endif + +.L91: # nr=mr=1,kr=4 + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + + LD a2, 2*SIZE(A) + LD b2, 2*SIZE(B) + MADD t11,t11,a4,b4 + + LD a6, 3*SIZE(A) + LD b6, 3*SIZE(B) + MADD t11,t11,a2,b2 + + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 + daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 + + LD a0, 0*SIZE(A) + LD b0, 0*SIZE(B) + MADD t11,t11,a6,b6 + + daddiu K,K,-1 + bnez K,.L91 + nop + +.L95: # kr=2 +#ifndef TRMMKERNEL + andi K,KCO,2 +#else + andi K,TEMP,2 +#endif + beqz K,.L98 + nop + +.L96: + LD a4, 1*SIZE(A) + LD b4, 1*SIZE(B) + MADD t11,t11,a0,b0 + daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 + 
daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 + + LD b0,0(B) + LD a0,0(A) + MADD t11,t11,a4,b4 + +.L98: # kr=1 +#ifndef TRMMKERNEL + andi K,KCO,1 +#else + andi K,TEMP,1 +#endif + beqz K,.L99 + LD ALPHA,152($sp) # Get ALPHA + + MADD t11,t11,a0,b0 + + +.L99: # Write Back +#ifndef TRMMKERNEL + LD c11,0(CO1) # Fetch 16 C + MADD t11,c11,t11,ALPHA + ST t11,0(CO1) + +#else + MUL t11, ALPHA, t11 + + ST t11, 0 * SIZE(CO1) +#endif + + +.L999: # End + ld $16, 0($sp) + ld $17, 8($sp) + ld $18, 16($sp) + ld $19, 24($sp) + ld $20, 32($sp) + ld $21, 40($sp) + ld $22, 48($sp) + LD $f24, 56($sp) + LD $f25, 64($sp) + LD $f26, 72($sp) + LD $f27, 80($sp) + LD $f28, 88($sp) + ld $23, 96($sp) + ld $24, 104($sp) + ld $25, 112($sp) + LD $f20,120($sp) + LD $f21,128($sp) + LD $f22,136($sp) + LD $f23,144($sp) + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S new file mode 100644 index 000000000..5ded7aed0 --- /dev/null +++ b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S @@ -0,0 +1,1468 @@ +#define ASSEMBLER +#include "common.h" + +#define FETCH ld +#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) +#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) + + +#define STACKSIZE 160 +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define R12 12 +#define R13 13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define PREA $16 +#define PREB $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 + +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define b5 $f12 +#define b6 $f13 +#define b7 $f15 +#define b8 $f16 + +#define c11 $f14 +#define c12 $f17 +#define c13 $f18 +#define c14 $f19 +#define c21 $f20 +#define c22 $f21 +#define c23 $f22 +#define c24 $f23 +#define c31 $f24 +#define c32 $f25 +#define c33 $f26 +#define c34 $f27 +#define c41 $f28 +#define c42 $f29 +#define c43 $f30 +#define c44 $f31 + +#define F0 0 +#define F1 1 +#define F2 2 +#define F3 3 +#define F4 4 +#define F5 5 +#define F6 6 +#define F7 7 +#define F8 8 +#define F9 9 +#define F10 10 +#define F11 11 +#define F12 12 +#define F13 13 +#define F14 14 +#define F15 15 +#define F16 16 +#define F17 17 +#define F18 18 +#define F19 19 +#define F20 20 +#define F21 21 +#define F22 22 +#define F23 23 +#define F24 24 +#define F25 25 +#define F26 26 +#define F27 27 +#define F28 28 +#define F29 29 +#define F30 30 +#define F31 31 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +################################# +## MADD1 a*c +## MADD2 b*c +## MADD3 a*d +## MADD4 d*b +################################## +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 
NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -STACKSIZE + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, STACKSIZE + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsra J, N, 1 # J=N/2 + ST ALPHA_R, 128($sp) # store alpha_r & alpha_i + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE + blez J, .L20 + ST ALPHA_I, 136($sp) + + + .align 5 +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + daddiu J, J, -1 + dsra I, M, 1 # I=M/2 + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + + move CO1, C # Fix pointer Cx + daddu CO2, C, LDC + + move AO, A # Reset AO + blez I, .L30 + daddu PREA, PREA, A # PREA=A+panel size + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + FETCH $0, 0 * SIZE(CO2) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 4 * SIZE(CO2) + MOV c41, c11 + MOV c42, c11 + + FETCH $0, 4 * SIZE(CO1) + MOV c43, c11 + MOV c44, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + NOP + +#else + + dsra L, K, 2 # Unroll K 4 times + move BO, B + + MTC $0, c11 # Clear results regs + LD a1, 0 * SIZE(AO) + MOV c12, c11 + LD a2, 1 * SIZE(AO) + + MOV c13, c11 + LD b1, 0 * SIZE(BO) + MOV c14, c11 + LD b2, 1 * SIZE(BO) + + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c22, c11 + LD a4, 3 * SIZE(AO) + + MOV c23, c11 + LD b3, 2 * SIZE(BO) + MOV c24, c11 + LD b4, 3 * SIZE(BO) + + MOV c31, c11 + MOV c32, c11 + FETCH $0, 0 * SIZE(CO2) + + MOV c33, c11 + MOV c34, c11 + FETCH $0, 0 * SIZE(CO1) + + MOV c41, c11 + MOV c42, c11 + FETCH $0, 4 * SIZE(CO2) + + MOV c43, c11 + NOP + FETCH $0, 4 * SIZE(CO1) + + daddu PREB, PREB, B # PREA=A+panel size + blez L, .L15 + MOV c44, c11 +#endif + + .align 5 + +.L12: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 4 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREB) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 8 * 
SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 8 * SIZE(PREA) + FETCH $0, 8 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + MADD2 c42, c42, a8, b7 + MADD4 c44, c44, a8, b8 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 12 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + daddiu AO, AO, 16 * SIZE + + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + daddu PREA, PREA, 16 * SIZE + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + daddu PREB, PREB, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b5 # A2xB1 + MADD3 c23, c23, a7, b6 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c22, c22, a8, b5 + MADD4 c24, c24, a8, b6 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 0 * SIZE(PREB) + MADD1 c31, c31, a5, b7 # A1xB2 + MADD3 c33, c33, a5, b8 + + MADD2 c32, c32, a6, b7 + MADD4 c34, c34, a6, b8 + + MADD1 c41, c41, a7, b7 # A2xB2 + MADD3 c43, c43, a7, b8 + + MADD2 c42, c42, a8, b7 + bgtz L, .L12 + MADD4 c44, c44, a8, b8 + + .align 5 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L18 + LD ALPHA_I, 136($sp) + + .align 5 + +.L16: + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + FETCH $0, 0 * SIZE(PREA) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + daddiu L, L, -1 + + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + FETCH $0, 0 * SIZE(PREB) + MADD1 c41, c41, a3, b3 # A2xB2 + MADD3 c43, c43, a3, b4 + + MADD2 c42, c42, a4, b3 + MADD4 c44, c44, a4, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L16 + NOP + +.L18: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + ADD c31, c34, 
c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + ADD c41, c44, c41 + LD b3, 2 * SIZE(CO2) + ADD c42, c43, c42 + LD b4, 3 * SIZE(CO2) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + ST a1, 0 * SIZE(CO1) + MADD b3, b3, ALPHA_R, c41 + MADD b4, b4, ALPHA_R, c42 + ST a2, 1 * SIZE(CO1) + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + ST b1, 2 * SIZE(CO1) + + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + ADD c31, c34, c31 + ADD c32, c33, c32 + ADD c41, c44, c41 + ADD c42, c43, c42 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + MUL b3, ALPHA_R, c41 + MUL b4, ALPHA_R, c42 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + NMSUB b3, b3, ALPHA_I, c42 + MADD b4, b4, ALPHA_I, c41 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + ST b3, 2 * SIZE(CO2) + ST b4, 3 * SIZE(CO2) + + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L11 + daddiu CO2,CO2, 4 * SIZE + + .align 5 +.L30: + andi I, M, 1 + daddu C, C, LDC # Change C to next panel + + daddu PREB, PREB, B # PREA=A+panel size + blez I, .L19 + daddu C, C, LDC # Change C to next panel + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT # MR=1 + dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c33, c11 + MOV c34, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO2) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 # MR=1 +#else + daddiu TEMP, KK, 2 # NR=2 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP + +#else + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREB) + MOV c31, c11 + MOV c32, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 0 * SIZE(CO2) + FETCH $0, 4 * SIZE(CO1) + 
FETCH $0, 4 * SIZE(CO2) + + MOV c33, c11 + blez L, .L35 + MOV c34, c11 +#endif + + .align 5 + +.L32: + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + FETCH $0, 4 * SIZE(PREB) + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + NOP + + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD3 c13, c13, a3, b6 # axd + + LD b1, 8 * SIZE(BO) + LD b2, 9 * SIZE(BO) + MADD2 c12, c12, a4, b5 # bxc + MADD4 c14, c14, a4, b6 # bxd + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + MADD1 c31, c31, a3, b7 # A1xB2 + MADD3 c33, c33, a3, b8 + + FETCH $0, 8 * SIZE(PREB) + MADD2 c32, c32, a4, b7 + MADD4 c34, c34, a4, b8 + daddiu L, L, -1 + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD3 c13, c13, a5, b2 # axd + + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + MADD2 c12, c12, a6, b1 # bxc + MADD4 c14, c14, a6, b2 # bxd + + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + MADD1 c31, c31, a5, b3 # A1xB2 + MADD3 c33, c33, a5, b4 + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREB) + MADD2 c32, c32, a6, b3 + MADD4 c34, c34, a6, b4 + NOP + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD3 c13, c13, a7, b6 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b5 # bxc + MADD4 c14, c14, a8, b6 # bxd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD1 c31, c31, a7, b7 # A1xB2 + NOP + + MADD3 c33, c33, a7, b8 + daddiu PREB, PREB, 16 * SIZE + + FETCH $0, 0 * SIZE(PREB) + MADD2 c32, c32, a8, b7 + bgtz L, .L32 + MADD4 c34, c34, a8, b8 + + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L38 + LD ALPHA_I, 136($sp) + .align 5 + +.L36: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx + MADD1 c31, c31, a1, b3 # A1xB2 + MADD3 c33, c33, a1, b4 + + daddiu PREB, PREB, 4 * SIZE + MADD2 c32, c32, a2, b3 + MADD4 c34, c34, a2, b4 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + bgtz L, .L36 + NOP + +.L38: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + + ADD c31, c34, c31 + LD a3, 0 * SIZE(CO2) + ADD c32, c33, c32 + LD a4, 1 * SIZE(CO2) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + MADD a3, a3, ALPHA_R, c31 + MADD a4, a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + ADD c31, c34, c31 + ADD c32, c33, c32 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + MUL a3, ALPHA_R, c31 + MUL a4, ALPHA_R, c32 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + NMSUB a3, a3, ALPHA_I, c32 + MADD a4, a4, ALPHA_I, c31 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + + 
ST a3, 0 * SIZE(CO2) + ST a4, 1 * SIZE(CO2) + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + + .align 5 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + bgtz J, .L10 + move B, BO + + .align 5 + +.L20: + andi J, N, 1 + blez J, .L999 + dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 + + dsra I, M, 1 # I=M/2 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move AO, A # Reset AO + blez I, .L29 + daddu PREA, PREA, A + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + ZBASE_SHIFT + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 # define Mr=2 +#else + daddiu TEMP, KK, 1 # define NR=1 +#endif + dsra L, TEMP, 2 + blez L, .L25 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MOV c21, c11 + MOV c22, c11 + + FETCH $0, 0 * SIZE(PREA) + MOV c23, c11 + MOV c24, c11 + + FETCH $0, 0 * SIZE(CO1) + FETCH $0, 4 * SIZE(CO1) + + blez L, .L25 + NOP +#endif + + .align 5 + +.L22: + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + FETCH $0, 4 * SIZE(PREA) + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD3 c13, c13, a5, b4 # axd + + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a6, b3 # bxc + MADD4 c14, c14, a6, b4 # bxd + + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + MADD1 c21, c21, a7, b3 # A2xB1 + MADD3 c23, c23, a7, b4 + + FETCH $0, 8 * SIZE(PREA) + MADD2 c22, c22, a8, b3 + MADD4 c24, c24, a8, b4 + daddiu L, L, -1 + + LD a5, 12 * SIZE(AO) + LD a6, 13 * SIZE(AO) + MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD3 c13, c13, a1, b6 # axd + + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a2, b5 # bxc + MADD4 c14, c14, a2, b6 # bxd + + LD a7, 14 * SIZE(AO) + LD a8, 15 * SIZE(AO) + MADD1 c21, c21, a3, b5 # A2xB1 + MADD3 c23, c23, a3, b6 + + daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx + daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx + + FETCH $0, 12 * SIZE(PREA) + MADD2 c22, c22, a4, b5 + MADD4 c24, c24, a4, b6 + daddiu PREA, PREA, 16 * SIZE + + LD a1, 0 * SIZE(AO) + LD a2, 1 * 
SIZE(AO) + MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD3 c13, c13, a5, b8 # axd + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a6, b7 # bxc + MADD4 c14, c14, a6, b8 # bxd + + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c21, c21, a7, b7 # A2xB1 + MADD3 c23, c23, a7, b8 + + FETCH $0, 0 * SIZE(PREA) + MADD2 c22, c22, a8, b7 + bgtz L, .L22 + MADD4 c24, c24, a8, b8 + + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L28 + LD ALPHA_I, 136($sp) + .align 3 + +.L26: + daddiu L, L, -1 + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + + daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + + daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx + MADD1 c21, c21, a3, b1 # A2xB1 + MADD3 c23, c23, a3, b2 + + daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx + MADD2 c22, c22, a4, b1 + MADD4 c24, c24, a4, b2 + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + bgtz L, .L26 + FETCH $0, 0 * SIZE(PREA) + +.L28: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + LD a1, 0 * SIZE(CO1) + ADD c12, c13, c12 + LD a2, 1 * SIZE(CO1) + ADD c21, c24, c21 + LD b1, 2 * SIZE(CO1) + ADD c22, c23, c22 + LD b2, 3 * SIZE(CO1) + + daddiu I, I, -1 + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + MADD b1, b1, ALPHA_R, c21 + MADD b2, b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + ADD c21, c24, c21 + ADD c22, c23, c22 + + daddiu I, I, -1 + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + MUL b1, ALPHA_R, c21 + MUL b2, ALPHA_R, c22 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + NMSUB b1, b1, ALPHA_I, c22 + MADD b2, b2, ALPHA_I, c21 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + ST b1, 2 * SIZE(CO1) + ST b2, 3 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + ZBASE_SHIFT + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + daddiu CO1,CO1, 4 * SIZE + bgtz I, .L21 + NOP + +.L29: + andi I, M, 1 + blez I, .L999 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L45 + NOP + +#else + dsra L, K, 2 # Unroll K 4 times + move BO, B + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MTC $0, c11 # Clear results 
regs + MOV c12, c11 + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MOV c13, c11 + MOV c14, c11 + + FETCH $0, 0 * SIZE(PREA) + FETCH $0, 4 * SIZE(PREA) + blez L, .L45 + NOP +#endif + + .align 3 + +.L42: +# gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + +# gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 + LD a5, 4 * SIZE(AO) + LD a6, 5 * SIZE(AO) + MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD3 c13, c13, a3, b4 # axd + +# gsLQC1(R13, F13, F12, 2) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + MADD2 c12, c12, a4, b3 # bxc + MADD4 c14, c14, a4, b4 # bxd + +# gsLQC1(R12, F11, F10, 3) + LD a7, 6 * SIZE(AO) + LD a8, 7 * SIZE(AO) + MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD3 c13, c13, a5, b6 # axd + + daddiu L, L, -1 + +# gsLQC1(R13, F16, F15, 3) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + MADD2 c12, c12, a6, b5 # bxc + MADD4 c14, c14, a6, b6 # bxd + + daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx + daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD3 c13, c13, a7, b8 # axd + +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + MADD2 c12, c12, a8, b7 # bxc + MADD4 c14, c14, a8, b8 # bxd + + bgtz L, .L42 + NOP + + + .align 5 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 + LD ALPHA_R, 128($sp) +#else + andi L, TEMP, 3 + LD ALPHA_R, 128($sp) +#endif + blez L, .L48 + LD ALPHA_I, 136($sp) + +.L46: + daddiu L, L, -1 + daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx + daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx + + MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD3 c13, c13, a1, b2 # axd + MADD2 c12, c12, a2, b1 # bxc + MADD4 c14, c14, a2, b2 # bxd + +# gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 +# gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + bgtz L, .L46 + NOP + +.L48: +#ifndef TRMMKERNEL + ADD c11, c14, c11 + ADD c12, c13, c12 + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + + MADD a1, a1, ALPHA_R, c11 + MADD a2, a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#else + ADD c11, c14, c11 + ADD c12, c13, c12 + + MUL a1, ALPHA_R, c11 + MUL a2, ALPHA_R, c12 + + NMSUB a1, a1, ALPHA_I, c12 + MADD a2, a2, ALPHA_I, c11 + + ST a1, 0 * SIZE(CO1) + ST a2, 1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + daddiu CO1,CO1, 2 * SIZE +#endif + + + + .align 5 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, STACKSIZE + + EPILOGUE diff --git a/param.h b/param.h index 610eb5fab..1cf08a3fa 100644 --- a/param.h +++ 
b/param.h @@ -1521,13 +1521,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 From c2dad58ad1ccbd1476827a1ccb615bd002248980 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Thu, 1 Dec 2011 16:33:11 +0000 Subject: [PATCH 36/52] Adding n32 multiple threads condition. --- common_linux.h | 6 +++++- common_mips64.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common_linux.h b/common_linux.h index 45a688d23..8d9019a0d 100644 --- a/common_linux.h +++ b/common_linux.h @@ -68,8 +68,12 @@ extern long int syscall (long int __sysno, ...); static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { -#if defined (LOONGSON3B) +#if defined (LOONGSON3B) +#if defined (__64BIT__) return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +#else + return 0; //NULL Implementation on Loongson 3B 32bit. +#endif #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 unsigned long null_nodemask=0; diff --git a/common_mips64.h b/common_mips64.h index 560f2c372..85348377e 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -120,6 +120,7 @@ static inline unsigned int rpcc(void){ } #if defined(LOONGSON3A) || defined(LOONGSON3B) +#ifndef NO_AFFINITY #define WHEREAMI static inline int WhereAmI(void){ int ret=0; @@ -131,6 +132,7 @@ static inline int WhereAmI(void){ } #endif +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; From a4292976e91eeab0c0e8aa8e6b81a9074e9933cb Mon Sep 17 00:00:00 2001 From: traz Date: Mon, 5 Dec 2011 14:54:25 +0000 Subject: [PATCH 37/52] Adding detection of complex situations in symm.c, otherwise the buffer address of sb will overlap the end of sa. 
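Background on the overlap this patch prevents: in the SMP path the sa and sb packing buffers are carved out of a single shared allocation, and the mode word tells the threaded driver how large each matrix element is. A complex element stores a real and an imaginary part, so a complex symm that advertises BLAS_REAL lets the driver undersize the sa region, and sb then starts before sa actually ends. A minimal C sketch of the offset arithmetic, with illustrative names (sa_bytes, P and Q here are not the actual OpenBLAS internals):

/* Sketch: element size doubles for complex data, and the start of sb
 * is computed from the size of sa.  Illustrative only.              */
#include <stddef.h>

size_t sa_bytes(int is_complex, size_t P, size_t Q, size_t fsize)
{
    size_t elem = is_complex ? 2 * fsize : fsize;   /* re + im vs. re only */
    return P * Q * elem;                            /* bytes held by sa    */
}

/* sb must start past the true end of sa, e.g.
 *   sb = (FLOAT *)((char *)sa + sa_bytes(1, P, Q, sizeof(double)));
 * computing the offset with is_complex == 0 for a zsymm call would
 * place sb inside sa -- exactly the overlap described above.        */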
--- interface/symm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/symm.c b/interface/symm.c index a0d52c49d..b447f13e8 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) @@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *sa, *sb; #ifdef SMP +#ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) @@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else int mode = BLAS_SINGLE | BLAS_REAL; #endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif #endif #if defined(SMP) && !defined(NO_AFFINITY) From 8e53b57bb26e4e9ac32cbb0b362a7946e2028944 Mon Sep 17 00:00:00 2001 From: Wang Qian Date: Tue, 10 Jan 2012 17:16:13 +0000 Subject: [PATCH 38/52] Adding gemmkernel and trmmkernel C code in kernel/generic; this code can be used on a new platform which does not have an optimized assembly kernel. --- kernel/Makefile.L3 | 86 +++ kernel/generic/gemmkernel_2x2.c | 157 ++++++ kernel/generic/trmmkernel_2x2.c | 280 ++++++++++ kernel/generic/zgemmkernel_2x2.c | 838 ++++++++++++++++++++++++++++ kernel/generic/ztrmmkernel_2x2.c | 923 +++++++++++++++++++++++++++++++ kernel/mips64/KERNEL.LOONGSON3B | 20 +- 6 files changed, 2296 insertions(+), 8 deletions(-) create mode 100644 kernel/generic/gemmkernel_2x2.c create mode 100644 kernel/generic/trmmkernel_2x2.c create mode 100644 kernel/generic/zgemmkernel_2x2.c create mode 100644 kernel/generic/ztrmmkernel_2x2.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4e331a445..4f419dc80 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -498,6 +498,91 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifeq ($(TARGET), LOONGSON3B) +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + 
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT 
-DTRANSA -DCONJ -DNC $< -o $@ +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -581,6 +666,7 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c new file mode 100644 index 000000000..3645ef154 --- /dev/null +++ b/kernel/generic/gemmkernel_2x2.c @@ -0,0 +1,157 @@ +#include "common.h" +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + for (j=0; j Date: Wed, 11 Jan 2012 16:05:39 +0000 Subject: [PATCH 39/52] Modify P Q R size of Loongson3b. --- param.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/param.h b/param.h index 1cf08a3fa..72d721d4e 100644 --- a/param.h +++ b/param.h @@ -1521,11 +1521,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1534,19 +1534,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 44 -#define CGEMM_DEFAULT_P 64 -#define ZGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_P 24 +#define CGEMM_DEFAULT_P 24 +#define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 92 +#define DGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 80 +#define ZGEMM_DEFAULT_Q 64 -#define SGEMM_DEFAULT_R 1024 -#define DGEMM_DEFAULT_R dgemm_r -#define CGEMM_DEFAULT_R 1024 -#define ZGEMM_DEFAULT_R 1024 +#define SGEMM_DEFAULT_R 512 +#define DGEMM_DEFAULT_R 512 +#define CGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 512 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 From 0a696bd4ce25abd7525327b889d9f9da418172ee Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 20 Feb 2012 23:36:58 +0800 Subject: [PATCH 40/52] Improved the makefile for Intel compiler. 
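A note on the GEMM_DEFAULT_P/Q/R values reduced in the param.h hunk of PATCH 39 above: P, Q and R are GEMM cache-blocking sizes. A is packed into P x Q panels and B into Q x R panels, so shrinking them reduces the cache footprint per block at the price of more packing passes, which suits the Loongson 3B's smaller caches. A schematic of how the three block loops nest, using the new double-precision defaults; this plain C stand-in elides the packing and the unrolled kernel that the real driver uses:

/* Blocked C += A*B, column-major.  P tiles the rows of A, Q tiles the
 * shared K dimension, R tiles the columns of B.                       */
#include <stddef.h>

#define P 24    /* DGEMM_DEFAULT_P above */
#define Q 128   /* DGEMM_DEFAULT_Q above */
#define R 512   /* DGEMM_DEFAULT_R above */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

void blocked_dgemm(size_t m, size_t n, size_t k,
                   const double *A, size_t lda,
                   const double *B, size_t ldb,
                   double *C, size_t ldc)
{
    for (size_t jj = 0; jj < n; jj += R)          /* R-wide panel of B/C */
        for (size_t kk = 0; kk < k; kk += Q)      /* Q-deep slice of K   */
            for (size_t ii = 0; ii < m; ii += P)  /* P-tall panel of A/C */
                /* stand-in for the packed micro-kernel on one block */
                for (size_t j = jj; j < min_sz(jj + R, n); j++)
                    for (size_t l = kk; l < min_sz(kk + Q, k); l++)
                        for (size_t i = ii; i < min_sz(ii + P, m); i++)
                            C[i + j * ldc] += A[i + l * lda] * B[l + j * ldb];
}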
--- Makefile | 1 + f_check | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 2f56480f9..eb9c4ff31 100644 --- a/Makefile +++ b/Makefile @@ -285,6 +285,7 @@ clean :: #ifdef DYNAMIC_ARCH @$(MAKE) -C kernel clean #endif + @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d lapack-3.4.0; then \ diff --git a/f_check b/f_check index 45a946eb6..f5bb5a7f6 100644 --- a/f_check +++ b/f_check @@ -284,6 +284,10 @@ if ($link ne "") { } +if ($vendor eq "INTEL"){ + $linker_a .= "-lgfortran" +} + open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; open(CONFFILE, ">> $config" ) || die "Can't append $config"; From 70abe10fc06142e6000b62dc931036f8987211cc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 7 Mar 2012 23:14:25 +0800 Subject: [PATCH 41/52] Check new LAPACK version in generating shared library. --- exports/gensymbol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/gensymbol b/exports/gensymbol index 3d8d74dde..6a7c8c1b4 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -301,7 +301,7 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 @objs = (@blasobjs); -} elsif (-d "../lapack-3.1.1") { +} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") { @objs = (@blasobjs, @lapackobjs, @lapackobjs2); } else { @objs = (@blasobjs, @lapackobjs); From 0bbf955d4c07ea1388aa025890be009f814394ae Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 12 Mar 2012 18:20:37 +0800 Subject: [PATCH 42/52] Refs #74. Added -lgfortran when generating the shared library. --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index e8ba3694e..3f5c7b4fa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -341,7 +341,8 @@ endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT -FCOMMON_OPT += -Wall +FCOMMON_OPT += -Wall +EXTRALIB += -lgfortran ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) ifdef BINARY64 From a7a7751be78b839ef4e3a34bf4a2cc7c97a7cd8d Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 14 Mar 2012 17:08:21 +0800 Subject: [PATCH 43/52] Export CBLAS functions on Windows DLL. 
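For reference, the gensymbol hunk below emits one line per CBLAS symbol in the name=name @ordinal form of a Windows module-definition file, which dllwrap and lib then consume. A hypothetical fragment of the resulting libopenblas.def (the ordinals simply continue the running symbol count):

; fragment of libopenblas.def -- symbol ordinals here are hypothetical
EXPORTS
	cblas_sgemm=cblas_sgemm @812
	cblas_dgemm=cblas_dgemm @813
	cblas_sdot=cblas_sdot @814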
--- exports/Makefile | 12 ++++++------ exports/gensymbol | 7 +++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 69050989c..2db6b6daa 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -58,16 +58,16 @@ dll : ../$(LIBDLLNAME) dll2 : libgoto2_shared.dll -../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) +../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) - $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) - -lib /machine:i386 /def:libgoto2.def + -lib /machine:i386 /def:libopenblas.def else - $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ + $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) - -lib /machine:X64 /def:libgoto2.def + -lib /machine:X64 /def:libopenblas.def endif libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def @@ -75,7 +75,7 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) -libgoto2.def : gensymbol +libopenblas.def : gensymbol perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) libgoto2_shared.def : gensymbol diff --git a/exports/gensymbol b/exports/gensymbol index 6a7c8c1b4..6b2a00672 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -389,6 +389,13 @@ if ($ARGV[0] eq "win2k"){ $count ++; } + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print "\t",$objs,"=$objs"," \@", $count, "\n"; + $count ++; + } + } + exit(0); } From d047afe61571d6b8fab6ffb896ff48f147cc8f07 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 15 Mar 2012 01:07:34 +0800 Subject: [PATCH 44/52] Set shared library soname in Linux. 
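How the pieces of this patch fit together: the exports/Makefile hunk embeds a DT_SONAME of libopenblas.so.$(MAJOR_VERSION) into the shared object at link time, while the Makefile and Makefile.install hunks create a symlink of that exact name for the dynamic loader, next to the plain libopenblas.so used by the link editor. A sketch of the resulting layout, with illustrative file names and 0 standing in for $(MAJOR_VERSION):

$ gcc -shared -Wl,-soname,libopenblas.so.0 -o libopenblas.so.0.1 foo.o
$ ln -fs libopenblas.so.0.1 libopenblas.so    # link-time name for -lopenblas
$ ln -fs libopenblas.so.0.1 libopenblas.so.0  # runtime name matching DT_SONAME
$ readelf -d libopenblas.so.0.1 | grep SONAME
 0x000000000000000e (SONAME)  Library soname: [libopenblas.so.0]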
--- Makefile | 1 + Makefile.install | 1 + Makefile.system | 1 + exports/Makefile | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eb9c4ff31..a320112a5 100644 --- a/Makefile +++ b/Makefile @@ -83,6 +83,7 @@ shared : ifeq ($(OSNAME), Linux) $(MAKE) -C exports so -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) libopenblas.so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) $(MAKE) -C exports so diff --git a/Makefile.install b/Makefile.install index 2778a491f..dbc28c568 100644 --- a/Makefile.install +++ b/Makefile.install @@ -44,6 +44,7 @@ install : lib.grd ifeq ($(OSNAME), Linux) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) diff --git a/Makefile.system b/Makefile.system index 3f5c7b4fa..9f994a716 100644 --- a/Makefile.system +++ b/Makefile.system @@ -637,6 +637,7 @@ MD5SUM = md5sum AWK = awk REVISION = -r$(VERSION) +MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) diff --git a/exports/Makefile b/exports/Makefile index 2db6b6daa..28a0882e3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,7 +100,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def -Wl,-soname,libopenblas.so.$(MAJOR_VERSION) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest From 722dd087032261b62867e6b203e8b77baf1653aa Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 16 Mar 2012 20:29:39 +0800 Subject: [PATCH 45/52] ref #80. On P4 CPU with 32-bit Windows XP, Octave crashed with OpenBLAS. Workaround: Use netlib reference gemv instead of our own functions. 
For example, make USE_NETLIB_GEMV=1 --- interface/Makefile | 18 ++- interface/netlib/cgemv.f | 285 +++++++++++++++++++++++++++++++++++++++ interface/netlib/dgemv.f | 265 ++++++++++++++++++++++++++++++++++++ interface/netlib/sgemv.f | 265 ++++++++++++++++++++++++++++++++++++ interface/netlib/zgemv.f | 285 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 1117 insertions(+), 1 deletion(-) create mode 100644 interface/netlib/cgemv.f create mode 100644 interface/netlib/dgemv.f create mode 100644 interface/netlib/sgemv.f create mode 100644 interface/netlib/zgemv.f diff --git a/interface/Makefile b/interface/Makefile index 6764daa95..5cf11cd9b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -770,20 +770,36 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< +else +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< +endif qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< - + +ifndef USE_NETLIB_GEMV cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< +else +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< + +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< +endif xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< diff --git a/interface/netlib/cgemv.f b/interface/netlib/cgemv.f new file mode 100644 index 000000000..d9e55f9a2 --- /dev/null +++ b/interface/netlib/cgemv.f @@ -0,0 +1,285 @@ + SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or +* +* y := alpha*A**H*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. 
+* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY + LOGICAL NOCONJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* + NOCONJ = LSAME(TRANS,'T') +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. 
+* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = ZERO + IF (NOCONJ) THEN + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + ELSE + DO 100 I = 1,M + TEMP = TEMP + CONJG(A(I,J))*X(I) + 100 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140 J = 1,N + TEMP = ZERO + IX = KX + IF (NOCONJ) THEN + DO 120 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130 I = 1,M + TEMP = TEMP + CONJG(A(I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END diff --git a/interface/netlib/dgemv.f b/interface/netlib/dgemv.f new file mode 100644 index 000000000..a41259412 --- /dev/null +++ b/interface/netlib/dgemv.f @@ -0,0 +1,265 @@ + SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* DGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. 
+* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE,ZERO + PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = ZERO + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120 J = 1,N + TEMP = ZERO + IX = KX + DO 110 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMV . 
+* + END diff --git a/interface/netlib/sgemv.f b/interface/netlib/sgemv.f new file mode 100644 index 000000000..afae26980 --- /dev/null +++ b/interface/netlib/sgemv.f @@ -0,0 +1,265 @@ + SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + REAL ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + REAL A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* SGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE,ZERO + PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. 
+ + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('SGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = ZERO + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120 J = 1,N + TEMP = ZERO + IX = KX + DO 110 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMV . +* + END diff --git a/interface/netlib/zgemv.f b/interface/netlib/zgemv.f new file mode 100644 index 000000000..bb2ae4fcb --- /dev/null +++ b/interface/netlib/zgemv.f @@ -0,0 +1,285 @@ + SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER INCX,INCY,LDA,M,N + CHARACTER TRANS +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),X(*),Y(*) +* .. +* +* Purpose +* ======= +* +* ZGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or +* +* y := alpha*A**H*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Arguments +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. 
+* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* Further Details +* =============== +* +* Level 2 Blas routine. +* The vector and matrix arguments are not referenced when N = 0, or M = 0 +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY + LOGICAL NOCONJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 1 + ELSE IF (M.LT.0) THEN + INFO = 2 + ELSE IF (N.LT.0) THEN + INFO = 3 + ELSE IF (LDA.LT.MAX(1,M)) THEN + INFO = 6 + ELSE IF (INCX.EQ.0) THEN + INFO = 8 + ELSE IF (INCY.EQ.0) THEN + INFO = 11 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN +* + NOCONJ = LSAME(TRANS,'T') +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF (LSAME(TRANS,'N')) THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF (INCX.GT.0) THEN + KX = 1 + ELSE + KX = 1 - (LENX-1)*INCX + END IF + IF (INCY.GT.0) THEN + KY = 1 + ELSE + KY = 1 - (LENY-1)*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. 
+* + IF (BETA.NE.ONE) THEN + IF (INCY.EQ.1) THEN + IF (BETA.EQ.ZERO) THEN + DO 10 I = 1,LENY + Y(I) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1,LENY + Y(I) = BETA*Y(I) + 20 CONTINUE + END IF + ELSE + IY = KY + IF (BETA.EQ.ZERO) THEN + DO 30 I = 1,LENY + Y(IY) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1,LENY + Y(IY) = BETA*Y(IY) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF (ALPHA.EQ.ZERO) RETURN + IF (LSAME(TRANS,'N')) THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF (INCY.EQ.1) THEN + DO 60 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80 J = 1,N + IF (X(JX).NE.ZERO) THEN + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. +* + JY = KY + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = ZERO + IF (NOCONJ) THEN + DO 90 I = 1,M + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + ELSE + DO 100 I = 1,M + TEMP = TEMP + DCONJG(A(I,J))*X(I) + 100 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140 J = 1,N + TEMP = ZERO + IX = KX + IF (NOCONJ) THEN + DO 120 I = 1,M + TEMP = TEMP + A(I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130 I = 1,M + TEMP = TEMP + DCONJG(A(I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + END IF + Y(JY) = Y(JY) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMV . +* + END From dff146e306e5ea8accb2463b9e35bfa83297dc01 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 19 Mar 2012 17:56:22 +0800 Subject: [PATCH 46/52] refs #80. Used GEMV SSE2 kernels on x86. --- kernel/x86/KERNEL | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 69becf69f..83fd551bc 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -239,6 +239,22 @@ ifndef ZSWAPKERNEL ZSWAPKERNEL = zswap_sse2.S endif +ifndef DGEMVNKERNEL +DGEMVNKERNEL = gemv_n_sse2.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = gemv_t_sse2.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n_sse2.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t_sse2.S +endif + endif From 1f15bee02acaff0255566360a36a0fc2e30441c6 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 21 Mar 2012 23:57:09 +0800 Subject: [PATCH 47/52] Refs #81. Added LIBNAMESUFFIX flag in Makefile.rule. The user can use this flag to control the library name, e.g. libopenblas.a, libopenblas_ifort.a or libopenblas_omp.a. 
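For example, a hypothetical invocation (USE_OPENMP is the build system's existing threading switch; only LIBNAMESUFFIX is new in this patch):

    make USE_OPENMP=1 LIBNAMESUFFIX=omp
    make LIBNAMESUFFIX=omp install

This should produce libopenblas_omp.a and libopenblas_omp.so, with the soname libopenblas_omp.so.$(MAJOR_VERSION) set by exports/Makefile.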
--- Makefile | 26 +++++++++++++------------- Makefile.install | 16 ++++++++-------- Makefile.rule | 5 +++++ Makefile.system | 4 ++++ exports/Makefile | 2 +- 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index a320112a5..ba04aa989 100644 --- a/Makefile +++ b/Makefile @@ -82,28 +82,28 @@ endif shared : ifeq ($(OSNAME), Linux) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so - -ln -fs $(LIBSONAME) libopenblas.so.$(MAJOR_VERSION) + -ln -fs $(LIBSONAME) $(LIBPREFIX).so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) $(MAKE) -C exports so - -ln -fs $(LIBSONAME) libopenblas.so + -ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) $(MAKE) -C exports dyn - -ln -fs $(LIBDYNNAME) libopenblas.dylib + -ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) libopenblas.dll + -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) libopenblas.dll + -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif tests : @@ -131,7 +131,7 @@ endif ifeq ($(NOFORTRAN), 1) $(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.) endif - -ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + -ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -159,7 +159,7 @@ endif prof : prof_blas prof_lapack prof_blas : - ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) + ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d prof || exit 1 ; \ @@ -170,7 +170,7 @@ ifdef DYNAMIC_ARCH endif blas : - ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ @@ -178,7 +178,7 @@ blas : done hpl : - ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -192,7 +192,7 @@ ifdef DYNAMIC_ARCH endif hpl_p : - ln -fs $(LIBNAME_P) libopenblas_p.$(LIBSUFFIX) + ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -287,7 +287,7 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d lapack-3.4.0; then \ echo deleting lapack-3.4.0; \ diff --git a/Makefile.install b/Makefile.install index dbc28c568..46105fc39 100644 --- a/Makefile.install +++ b/Makefile.install @@ -38,34 +38,34 @@ install : lib.grd #for install static library @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) - @-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.$(LIBSUFFIX) + @-ln -fs 
$(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX) #for install shared library @echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), Linux) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so.$(MAJOR_VERSION) + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) -cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.so + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) -cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR) -install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dylib + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/libopenblas.dll + -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif @echo Install OK! diff --git a/Makefile.rule b/Makefile.rule index db1a48d9f..a73a9553c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -5,6 +5,11 @@ # This library's version VERSION = 0.1alpha2.5 +# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a +# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname of the shared library +# is libopenblas_$(LIBNAMESUFFIX).so.0. +# LIBNAMESUFFIX = omp + # You can specify the target architecture, otherwise it's # automatically detected. # TARGET = PENRYN diff --git a/Makefile.system b/Makefile.system index 9f994a716..8ec93031e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -569,7 +569,11 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif +ifndef LIBNAMESUFFIX LIBPREFIX = libopenblas +else +LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) +endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) diff --git a/exports/Makefile b/exports/Makefile index 28a0882e3..873e8b270 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,7 +100,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def -Wl,-soname,libopenblas.so.$(MAJOR_VERSION) $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest From 31c836ac255a1d23e5694ab85c760edc0c6e0214 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 23 Mar 2012 01:17:41 +0800 Subject: [PATCH 48/52] Ref #79. Added the GEMM_MULTITHREAD_THRESHOLD flag to use a single thread in GEMM for small matrices. --- Makefile.rule | 5 +++++ Makefile.system | 5 +++++ getarch_2nd.c | 1 + interface/gemm.c | 7 ++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a73a9553c..f7d60b052 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -88,6 +88,11 @@ VERSION = 0.1alpha2.5 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 +# If any GEMM argument m, n or k is less than or equal to this threshold, GEMM will +# run with a single thread. You can use this flag to avoid the overhead of +# multi-threading for small matrices. The default value is 4. +# GEMM_MULTITHREAD_THRESHOLD = 4 + # If you need a sanity check by comparing with reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 diff --git a/Makefile.system b/Makefile.system index 8ec93031e..b8b9ba837 100644 --- a/Makefile.system +++ b/Makefile.system @@ -40,6 +40,11 @@ ifdef INTERFACE64 GETARCH_FLAGS += -DUSE64BITINT endif +ifndef GEMM_MULTITHREAD_THRESHOLD +GEMM_MULTITHREAD_THRESHOLD=4 +endif +GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) + # This operation is expensive, so it should be executed only once. ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/getarch_2nd.c b/getarch_2nd.c index 018f08d31..5339af442 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,6 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); } return 0; diff --git a/interface/gemm.c b/interface/gemm.c index 7919f822e..28cf5372d 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -397,8 +397,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); args.common = NULL; - args.nthreads = num_cpu_avail(3); + if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD + || args.k <= GEMM_MULTITHREAD_THRESHOLD){ + args.nthreads = 1; + }else{ + args.nthreads = num_cpu_avail(3); + } if (args.nthreads == 1) { #endif From ccdba3c7711059b807851529cb9fdfa2ae3246e8 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 23 Mar 2012 01:29:05 +0800 Subject: [PATCH 49/52] Updated the version to 0.1.0. --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index f7d60b052..650478a07 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1alpha2.5 +VERSION = 0.1.0 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname of the shared library From 91ce66a0a8a895279d8aabed3303b43bb8351808 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 23 Mar 2012 15:15:05 +0800 Subject: [PATCH 50/52] Ref #82. Fixed the bug in the my_mbind function.
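The old workaround replaced a NULL nodemask with a pointer to a zeroed local mask, which caused random SEGFAULTs on Linux 2.6.34 and later; the fix forwards the caller's nodemask to the kernel unchanged. A minimal sketch of the corrected wrapper (not the verbatim common_linux.h source, which keeps additional preprocessor branches for other kernel versions):

    #include <unistd.h>
    #include <sys/syscall.h>

    /* Forward the caller's nodemask as-is: NULL must reach the kernel as
       NULL, not as a pointer to a zeroed local mask. */
    static inline int my_mbind(void *addr, unsigned long len, int mode,
                               unsigned long *nodemask, unsigned long maxnode,
                               unsigned flags) {
      return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
    }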
--- common_linux.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_linux.h b/common_linux.h index 8d9019a0d..b0381d991 100644 --- a/common_linux.h +++ b/common_linux.h @@ -76,8 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, #endif #else // Fixes random SEGFAULTs when nodemask==NULL on Linux 2.6.34 and above - unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, &null_nodemask, maxnode, flags); +// unsigned long null_nodemask=0; + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif } From 0b89a7a92d4464acf90861e715eedc4d6cf85fbf Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 23 Mar 2012 18:17:12 +0800 Subject: [PATCH 51/52] Ref #82. Disable outputting debug information in alloc_mmap. --- driver/others/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index feb45eb58..3f1a5f60a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -389,8 +389,7 @@ static void *alloc_mmap(void *address){ if (map_address != (void *)-1) { #ifdef OS_LINUX -#if 1 - //#ifdef DEBUG +#ifdef DEBUG int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ From 2b3eae6cc782767f690399e39ed94ba479737437 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 23 Mar 2012 18:45:54 +0800 Subject: [PATCH 52/52] Ref #70. Updated Changelog.txt. --- Changelog.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index ae2a77e5a..e122300ec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.1.0 +23-Mar-2012 +common: + * Set soname of shared library on Linux. + * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use + this flag to control the library name, e.g. libopenblas.a, + libopenblas_ifort.a or libopenblas_omp.a. + * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. + The library uses a single thread in GEMM for small matrices. +x86/x86_64: + * Used GEMV SSE/SSE2 kernels on x86 32-bit. + * Exported CBLAS functions in Windows DLL. +MIPS64: + * Completed Level-3 BLAS optimization on Loongson 3A CPU. + * Improved GEMV performance on Loongson 3A CPU. + * Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENTAL) + ==================================================================== Version 0.1 alpha2.5 19-Feb-2012