diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 075957038..02a0833dd 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -2,13 +2,12 @@ #define ASSEMBLER #include "common.h" +#define FETCH ld +#define STACKSIZE 192 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) -#define FETCH ld -#define STACKSIZE 192 - ##### Parameter registers #### #define M $4 @@ -115,13 +114,13 @@ #define R16 16 #define R17 17 - #.text -#.align 2 -# .globl REALNAME +# .text +# .align 2 +## .globl gemm # .set nomips16 -# .ent REALNAME -# .type REALNAME, @function -#REALNAME: +# .ent gemm +# .type gemm, @function +#gemm: # .frame $fp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 @@ -166,11 +165,8 @@ .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE - - ST ALPHA, 152($fp) # Store alpha blez J, .L2 - NOP - + ST ALPHA, 152($fp) .L48: dsra I, M, 3 # MR=8 @@ -189,9 +185,9 @@ daddu C, CO4, LDC .align 4 -.L488: +.L481: move BO, B # Reset B - dsra L, K, 2 # UnRoll K=8 + dsra L, K, 6 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 @@ -233,10 +229,10 @@ FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 - blez L, .L484 - FETCH $0, 0 * SIZE(CO4) + blez L, .L482 + FETCH $0, 4 * SIZE(CO4) -.L4880: +.L4810: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -252,21 +248,21 @@ MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 - FETCH $0, 0 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 0 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) - FETCH $0, 4 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -285,21 +281,21 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 4 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - FETCH $0, 8 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 12 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 @@ -316,25 +312,25 @@ MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 - FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR - FETCH $0, 20 * SIZE(PREA) MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 - FETCH $0, 8 * SIZE(PREB) MADPS C14, C14, A1, B4 PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 @@ -353,45 +349,4246 @@ MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 - FETCH $0, 12 * SIZE(PREB) + MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE - FETCH $0, 24 * SIZE(PREA) MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 - FETCH $0, 28 * SIZE(PREA) MADPS C14, C14, A5, B8 PLU B3, B1, B1 - daddiu PREB, PREB, 16 * SIZE + FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE - + MADPS C34, C34, A7, B8 - bgtz L, .L4880 MADPS C44, C44, A8, B8 - .align 4 -.L484: - andi L, K, 4 - blez L, .L482 - NOP - + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + bgtz L, .L4810 + MADPS C44, C44, A8, B8 + .align 4 .L482: - andi L, K, 2 - blez L, .L481 + andi L, K, 32 + blez L, .L483 NOP + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + .align 4 -.L481: +.L483: + andi L, K, 16 + blez L, .L484 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L484: + andi L, K, 8 + blez L, .L485 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L485: + andi L, K, 4 + blez L, .L486 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 4) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 5) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 6) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 7) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 8 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 32 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 16 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 20 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 24 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 28 * SIZE(PREA) + daddiu PREA, PREA, 32 * SIZE + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + + + .align 4 +.L486: + andi L, K, 2 + blez L, .L487 + NOP + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 8 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 16 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + PLU B7, B5, B5 + FETCH $0, 0 * SIZE(PREA) + + MADPS C24, C24, A2, B4 + PLU B8, B6, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + MADPS C11, C11, A5, B5 + MADPS C21, C21, A6, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C12, C12, A5, B6 + MADPS C22, C22, A6, B6 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C31, C31, A7, B5 + MADPS C41, C41, A8, B5 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C32, C32, A7, B6 + MADPS C42, C42, A8, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C13, C13, A5, B7 + MADPS C23, C23, A6, B7 + daddiu PREB, PREB, 8 * SIZE + + MADPS C33, C33, A7, B7 + MADPS C43, C43, A8, B7 + + MADPS C14, C14, A5, B8 + PLU B3, B1, B1 + FETCH $0, 8 * SIZE(PREA) + + MADPS C24, C24, A6, B8 + PLU B4, B2, B2 + FETCH $0, 12 * SIZE(PREA) + + MADPS C34, C34, A7, B8 + MADPS C44, C44, A8, B8 + daddiu PREA, PREA, 16 * SIZE + + + .align 4 +.L487: andi L, K, 1 blez L, .L480 - NOP + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + MADPS C22, C22, A2, B2 + + MADPS C31, C31, A3, B1 + MADPS C41, C41, A4, B1 + + MADPS C32, C32, A3, B2 + MADPS C42, C42, A4, B2 + + MADPS C13, C13, A1, B3 + daddiu BO, BO, 4 * SIZE # 4KR*4NR + MADPS C23, C23, A2, B3 + daddiu AO, AO, 8 * SIZE # 4KR*8MR + + MADPS C33, C33, A3, B3 + MADPS C43, C43, A4, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + MADPS C34, C34, A3, B4 + MADPS C44, C44, A4, B4 + + .align 4 .L480: # Write Back @@ -417,56 +4614,57 @@ CVTU A8, C41 # A8=C41.upper=c28 LD B6, 5 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B7, 7 * SIZE(CO1) - MADD A2, B2, A2, ALPHA # c22 + MADD A2, B2, A2, ALPHA # c22 LD B1, 7 * SIZE(CO2) - MADD A3, B3, A3, ALPHA # c14 + MADD A3, B3, A3, ALPHA # c14 LD B2, 0 * SIZE(CO1) - MADD A4, B4, A4, ALPHA # c24 + MADD A4, B4, A4, ALPHA # c24 LD B3, 0 * SIZE(CO2) - MADD A5, B5, A5, ALPHA # c16 + MADD A5, B5, A5, ALPHA # c16 LD B4, 2 * SIZE(CO1) - MADD A6, B6, A6, ALPHA # c26 + MADD A6, B6, A6, ALPHA # c26 LD B5, 2 * SIZE(CO2) - MADD A7, B7, A7, ALPHA # c18 + MADD A7, B7, A7, ALPHA # c18 LD B6, 4 * SIZE(CO1) + + MADD A8, B1, A8, ALPHA # c28 ST A1, 1 * SIZE(CO1) - MADD A8, B1, A8, ALPHA # c28 + MADD C11, B2, C11, ALPHA # c12 LD B7, 4 * SIZE(CO2) + + MADD C13, B3, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) - MADD C11, B2, C11, ALPHA # c12 + MADD C21, B4, C21, ALPHA # c14 LD A1, 6 * SIZE(CO1) + + MADD C23, B5, C23, ALPHA # c24 ST A3, 3 * SIZE(CO1) - MADD C13, B3, C13, ALPHA # c22 + MADD C31, B6, C31, ALPHA # c16 LD A2, 6 * SIZE(CO2) + + MADD C33, B7, C33, ALPHA # c26 ST A4, 3 * SIZE(CO2) - MADD C21, B4, C21, ALPHA # c14 ST A5, 5 * SIZE(CO1) - - MADD C23, B5, C23, ALPHA # c24 ST A6, 5 * SIZE(CO2) - - MADD C31, B6, C31, ALPHA # c16 ST A7, 7 * SIZE(CO1) - - MADD C33, B7, C33, ALPHA # c26 ST A8, 7 * SIZE(CO2) - MADD C41, A1, C41, ALPHA # c18 + MADD C41, A1, C41, ALPHA # c18 ST C11, 0 * SIZE(CO1) - MADD C43, A2, C43, ALPHA # c28 + MADD C43, A2, C43, ALPHA # c28 ST C13, 0 * SIZE(CO2) ST C21, 2 * SIZE(CO1) @@ -499,87 +4697,1327 @@ CVTU A8, C42 # A1=C44.upper=c38 LD B7, 7 * SIZE(CO3) - MADD A1, B1, A1, ALPHA # c31 + MADD A1, B1, A1, ALPHA # c31 LD C11, 7 * SIZE(CO4) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD C13, 0 * SIZE(CO3) - MADD A3, B3, A3, ALPHA + MADD A3, B3, A3, ALPHA LD C21, 0 * SIZE(CO4) - MADD A4, B4, A4, ALPHA + MADD A4, B4, A4, ALPHA LD C23, 2 * SIZE(CO3) - MADD A5, B5, A5, ALPHA + MADD A5, B5, A5, ALPHA LD C31, 2 * SIZE(CO4) - MADD A6, B6, A6, ALPHA + MADD A6, B6, A6, ALPHA LD C33, 4 * SIZE(CO3) - MADD A7, B7, A7, ALPHA + MADD A7, B7, A7, ALPHA LD C41, 4 * SIZE(CO4) + + MADD A8, C11, A8, ALPHA ST A1, 1 * SIZE(CO3) - MADD A8, C11, A8, ALPHA + MADD C12, C13, C12, ALPHA LD C43, 6 * SIZE(CO3) + + MADD C14, C21, C14, ALPHA ST A2, 1 * SIZE(CO4) - MADD C12, C13, C12, ALPHA + MADD C22, C23, C22, ALPHA LD B1, 6 * SIZE(CO4) + + MADD C24, C31, C24, ALPHA ST A3, 3 * SIZE(CO3) - MADD C14, C21, C14, ALPHA + MADD C32, C33, C32, ALPHA ST A4, 3 * SIZE(CO4) - MADD C22, C23, C22, ALPHA + MADD C34, C41, C34, ALPHA ST A5, 5 * SIZE(CO3) - MADD C24, C31, C24, ALPHA + MADD C42, C43, C42, ALPHA ST A6, 5 * SIZE(CO4) - MADD C32, C33, C32, ALPHA ST A7, 7 * SIZE(CO3) + NOP - MADD C34, C41, C34, ALPHA + MADD C44, B1, C44, ALPHA ST A8, 7 * SIZE(CO4) - MADD C42, C43, C42, ALPHA ST C12, 0 * SIZE(CO3) - - MADD C44, B1, C44, ALPHA ST C14, 0 * SIZE(CO4) - ST C22, 2 * SIZE(CO3) - daddiu CO1, CO1, 8 * SIZE - ST C24, 2 * SIZE(CO4) - daddiu CO2, CO2, 8 * SIZE - ST C32, 4 * SIZE(CO3) ST C34, 4 * SIZE(CO4) ST C42, 6 * SIZE(CO3) ST C44, 6 * SIZE(CO4) + daddiu CO1, CO1, 8 * SIZE + daddiu CO2, CO2, 8 * SIZE daddiu CO3, CO3, 8 * SIZE - bgtz I, .L488 + bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE -.L44: + .align 4 +.L44: + andi I, M, 4 # MR=4 + blez I, .L42 + NOP + + .align 4 +.L441: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + dsll PREB, K, BASE_SHIFT + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + daddu PREB, B, PREB + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L442 + NOP + +.L4410: # + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F5, F4, 2) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + MADPS C23, C23, A4, B7 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A5, B1 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + MADPS C21, C21, A6, B1 + gsLQC1(R12, F7, F6, 3) # A7 A8 + + MADPS C12, C12, A5, B2 + FETCH $0, 8 * SIZE(PREB) + daddiu BO, BO, 16 * SIZE # 4KR*4NR + + MADPS C22, C22, A6, B2 + FETCH $0, 8 * SIZE(PREA) + daddiu AO, AO, 16 * SIZE # 4KR*4MR + + MADPS C13, C13, A5, B3 + MADPS C23, C23, A6, B3 + + MADPS C14, C14, A5, B4 + MADPS C24, C24, A6, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A7, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A8, B5 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MADPS C12, C12, A7, B6 + FETCH $0, 12 * SIZE(PREB) + + MADPS C22, C22, A8, B6 + FETCH $0, 12 * SIZE(PREA) + + MADPS C13, C13, A7, B7 + daddiu PREA, PREA, 16 * SIZE + MADPS C23, C23, A8, B7 + daddiu PREB, PREB, 16 * SIZE + + MADPS C14, C14, A7, B8 + MADPS C24, C24, A8, B8 + + PLU B3, B1, B1 + bgtz L, .L4410 + PLU B4, B2, B2 + + .align 4 +.L442: + andi L, K, 2 + blez L, .L443 + NOP + + MADPS C11, C11, A1, B1 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C21, C21, A2, B1 + gsLQC1(R12, F3, F2, 1) # A3 A4 + + MADPS C12, C12, A1, B2 + FETCH $0, 0 * SIZE(PREB) + daddiu BO, BO, 8 * SIZE # 2KR*4NR + + MADPS C22, C22, A2, B2 + FETCH $0, 0 * SIZE(PREA) + daddiu AO, AO, 8 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A3, B5 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C21, C21, A4, B5 + gsLQC1(R12, F1, F0, 0) # A5 A6 + + MADPS C12, C12, A3, B6 + FETCH $0, 4 * SIZE(PREB) + + MADPS C22, C22, A4, B6 + FETCH $0, 4 * SIZE(PREA) + + MADPS C13, C13, A3, B7 + daddiu PREB, PREB, 8 + MADPS C23, C23, A4, B7 + daddiu PREA, PREA, 8 + + MADPS C14, C14, A3, B8 + MADPS C24, C24, A4, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + + .align 4 +.L443: + andi L, K, 1 + blez L, .L440 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C21, C21, A2, B1 + + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 1KR*4NR + MADPS C22, C22, A2, B2 + daddiu AO, AO, 4 * SIZE # 1KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C23, C23, A2, B3 + + MADPS C14, C14, A1, B4 + MADPS C24, C24, A2, B4 + + + .align 4 +.L440: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + CVTU A3, C23 # A3=C23.upper=c14 + LD B3, 3 * SIZE(CO1) + + CVTU A4, C21 # A4=C21.upper=c24 + LD B4, 3 * SIZE(CO2) + + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD A3, B3, A3, ALPHA # c14 + LD B7, 2 * SIZE(CO1) + + MADD A4, B4, A4, ALPHA # c24 + LD B1, 2 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + MADD C21, B7, C21, ALPHA # c14 + ST A3, 3 * SIZE(CO1) + + MADD C23, B1, C23, ALPHA # c24 + ST A4, 3 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + ST C21, 2 * SIZE(CO1) + ST C23, 2 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + CVTU A3, C24 # B3=C22.upper=c44 + LD B3, 3 * SIZE(CO3) + + CVTU A4, C22 # B4=C24.upper=c34 + LD B4, 3 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD A3, B3, A3, ALPHA + LD A7, 2 * SIZE(CO3) + + MADD A4, B4, A4, ALPHA + LD A8, 2 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + MADD C22, A7, C22, ALPHA + ST A3, 3 * SIZE(CO3) + + MADD C24, A8, C24, ALPHA + ST A4, 3 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + ST C22, 2 * SIZE(CO3) + ST C24, 2 * SIZE(CO4) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + daddiu CO3, CO3, 4 * SIZE + daddiu CO4, CO4, 4 * SIZE + + + .align 4 +.L42: + andi I, M, 2 + blez I, .L41 + NOP + + .align 4 +.L421: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MOV C41, C11 + MOV C42, C11 + gsLQC1(R12, F1, F0, 0) # A1 A2 + + MOV C13, C11 + MOV C14, C11 + + MOV C23, C11 + FETCH $0, 0 * SIZE(CO1) + MOV C24, C11 + + MOV C33, C11 + FETCH $0, 0 * SIZE(CO2) + MOV C34, C11 + + MOV C43, C11 + FETCH $0, 0 * SIZE(CO3) + + MOV C44, C11 + + PLU B3, B1, B1 + FETCH $0, 0 * SIZE(CO4) + + PLU B4, B2, B2 + blez L, .L422 + NOP + +.L4210: + daddiu L, L, -1 + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + gsLQC1(R12, F3, F2, 1) # B1 B2 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + daddiu AO, AO, 8 * SIZE # 4KR*2MR + gsLQC1(R13, F9, F8, 2) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + + PLU B3, B1, B1 + PLU B4, B2, B2 + + MADPS C11, C11, A3, B1 + gsLQC1(R12, F1, F0, 0) # B3 B4 + + MADPS C12, C12, A3, B2 + gsLQC1(R13, F13, F12, 3) # B3 B4 + + daddiu BO, BO, 16 * SIZE # 4KR*4NR + MADPS C13, C13, A3, B3 + MADPS C14, C14, A3, B4 + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A4, B5 + MADPS C12, C12, A4, B6 + gsLQC1(R13, F9, F8, 0) # B3 B4 + + MADPS C13, C13, A4, B7 + MADPS C14, C14, A4, B8 + + PLU B3, B1, B1 + bgtz L, .L4210 + PLU B4, B2, B2 + + .align 4 +.L422: + andi L, K, 2 + blez L, .L423 + NOP + + daddiu AO, AO, 4 * SIZE # 2KR*2MR + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + gsLQC1(R13, F13, F12, 1) # B3 B4 + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + daddiu BO, BO, 8 * SIZE # 2KR*2MR + + PLU B7, B5, B5 + PLU B8, B6, B6 + + MADPS C11, C11, A2, B5 + MADPS C12, C12, A2, B6 + gsLQC1(R13, F9, F8, 0) # B1 B2 + + MADPS C13, C13, A2, B7 + MADPS C14, C14, A2, B8 + gsLQC1(R12, F1, F0, 0) + + PLU B3, B1, B1 + PLU B4, B2, B2 + +.L423: + andi L, K, 1 + blez L, .L420 + LD ALPHA, 152($fp) + + MADPS C11, C11, A1, B1 + MADPS C12, C12, A1, B2 + daddiu BO, BO, 4 * SIZE # 2KR*4NR + daddiu AO, AO, 2 * SIZE # 2KR*4MR + + MADPS C13, C13, A1, B3 + MADPS C14, C14, A1, B4 + + .align 4 +.L420: + CVTU A1, C13 # A1=C13.upper=c12 + LD B1, 1 * SIZE(CO1) + + CVTU A2, C11 # A2=C11.upper=c22 + LD B2, 1 * SIZE(CO2) + + MADD A1, B1, A1, ALPHA # c12 + LD B5, 0 * SIZE(CO1) + + MADD A2, B2, A2, ALPHA # c22 + LD B6, 0 * SIZE(CO2) + + MADD C11, B5, C11, ALPHA # c12 + ST A1, 1 * SIZE(CO1) + + MADD C13, B6, C13, ALPHA # c22 + ST A2, 1 * SIZE(CO2) + + ST C11, 0 * SIZE(CO1) + ST C13, 0 * SIZE(CO2) + + CVTU A1, C14 # B1=C12.upper=c42 + LD B1, 1 * SIZE(CO3) + + CVTU A2, C12 # B2=C14.upper=c32 + LD B2, 1 * SIZE(CO4) + + MADD A1, B1, A1, ALPHA # c31 + LD A5, 0 * SIZE(CO3) + + MADD A2, B2, A2, ALPHA + LD A6, 0 * SIZE(CO4) + + MADD C12, A5, C12, ALPHA + ST A1, 1 * SIZE(CO3) + + MADD C14, A6, C14, ALPHA + ST A2, 1 * SIZE(CO4) + + ST C12, 0 * SIZE(CO3) + ST C14, 0 * SIZE(CO4) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + + + .align 4 +.L41: + andi I, M, 1 + blez I, .L40 + NOP + + .align 4 +.L411: + move BO, B # Reset B + dsra L, K, 2 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD B1, 0 * SIZE(BO) + + MOV C21, C11 + MOV C22, C11 + LD A1, 0 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B2, 1 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B3, 2 * SIZE(BO) + + MOV C13, C11 + MOV C14, C11 + LD B4, 3 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L412 + MOV C44, C11 + +.L4110: + daddiu L, L, -1 + LD A2, 1 * SIZE(AO) + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A3, 2 * SIZE(AO) + NOP + + MADD C11, C11, A2, B5 + LD B1, 8 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 9 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 10 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 11 * SIZE(BO) + + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B1 + LD B5, 12 * SIZE(BO) + + MADD C12, C12, A3, B2 + LD B6, 13 * SIZE(BO) + + MADD C13, C13, A3, B3 + LD B7, 14 * SIZE(BO) + + MADD C14, C14, A3, B4 + LD B8, 15 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD C11, C11, A4, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A4, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A4, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A4, B8 + bgtz L, .L4110 + LD B4, 3 * SIZE(BO) + +.L412: + andi L, K, 2 + blez L, .L413 + NOP + + LD A2, 1 * SIZE(AO) + daddiu AO, AO, 2 * SIZE + + MADD C11, C11, A1, B1 + LD B5, 4 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD B6, 5 * SIZE(BO) + + MADD C13, C13, A1, B3 + LD B7, 6 * SIZE(BO) + + MADD C14, C14, A1, B4 + LD B8, 7 * SIZE(BO) + + LD A1, 0 * SIZE(AO) + daddiu BO, BO, 8 * SIZE + + MADD C11, C11, A2, B5 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A2, B6 + LD B2, 1 * SIZE(BO) + + MADD C13, C13, A2, B7 + LD B3, 2 * SIZE(BO) + + MADD C14, C14, A2, B8 + LD B4, 3 * SIZE(BO) + +.L413: + andi L, K, 1 + blez L, .L410 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + MADD C13, C13, A1, B3 + MADD C14, C14, A1, B4 + daddiu BO, BO, 4 * SIZE + + .align 4 +.L410: + LD A5, 0 * SIZE(CO1) + LD A6, 0 * SIZE(CO2) + LD A7, 0 * SIZE(CO3) + LD A8, 0 * SIZE(CO4) + + MADD A5, A5, C11, ALPHA + MADD A6, A6, C12, ALPHA + MADD A7, A7, C13, ALPHA + MADD A8, A8, C14, ALPHA + + ST A5, 0 * SIZE(CO1) + ST A6, 0 * SIZE(CO2) + ST A7, 0 * SIZE(CO3) + ST A8, 0 * SIZE(CO4) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + + .align 4 .L40: daddiu J, J, -1 move B, BO - bgtz J, .L48 NOP + + .align 4 .L2: # Nr=2 andi J, N, 2 blez J, .L1 NOP +.L28: + dsra I, M, 3 # MR=8 + + move AO, A # Reset A + move CO1, C + + daddu CO2, C, LDC + blez I, .L24 + daddu C, CO2, LDC + + + .align 4 +.L281: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C13, C11 + LD A7, 6 * SIZE(AO) + + MOV C14, C11 + LD A8, 7 * SIZE(AO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L282 + MOV C44, C11 + + + .align 4 +.L2810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + LD B3, 2 * SIZE(BO) + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + LD A2, 13 * SIZE(AO) + + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + LD A3, 14 * SIZE(AO) + + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B3 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B3 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B3 + MADD C23, C23, A2, B3 + LD A7, 6 * SIZE(AO) + + MADD C33, C33, A3, B3 + MADD C43, C43, A4, B3 + LD A8, 7 * SIZE(AO) + + MADD C14, C14, A1, B4 + MADD C24, C24, A2, B4 + LD B1, 0 * SIZE(BO) + + MADD C34, C34, A3, B4 + MADD C44, C44, A4, B4 + LD B2, 1 * SIZE(BO) + + MADD C31, C31, B7, B3 + MADD C41, C41, B8, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, B5, B4 + LD A2, 1 * SIZE(AO) + + MADD C22, C22, B6, B4 + LD A3, 2 * SIZE(AO) + + LD A4, 3 * SIZE(AO) + MADD C32, C32, B7, B4 + bgtz L, .L2810 + MADD C42, C42, B8, B4 + + .align 4 +.L282: + andi L, K, 1 + blez L, .L280 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + MADD C14, C14, A5, B2 + MADD C24, C24, A6, B2 + MADD C34, C34, A7, B2 + MADD C44, C44, A8, B2 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L280: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + LD A5, 4 * SIZE(CO1) + LD A6, 5 * SIZE(CO1) + LD A7, 6 * SIZE(CO1) + LD A8, 7 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD A5, A5, C13, ALPHA + LD B5, 4 * SIZE(CO2) + + MADD A6, A6, C23, ALPHA + LD B6, 5 * SIZE(CO2) + + MADD A7, A7, C33, ALPHA + LD B7, 6 * SIZE(CO2) + + MADD A8, A8, C43, ALPHA + LD C11, 7 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + MADD B5, B5, C14, ALPHA + ST A5, 4 * SIZE(CO1) + + MADD B6, B6, C24, ALPHA + ST A6, 5 * SIZE(CO1) + + MADD B7, B7, C34, ALPHA + ST A7, 6 * SIZE(CO1) + + MADD C11, C11, C44, ALPHA + ST A8, 7 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + ST B5, 4 * SIZE(CO2) + ST B6, 5 * SIZE(CO2) + ST B7, 6 * SIZE(CO2) + ST C11, 7 * SIZE(CO2) + + daddiu CO1, CO1, 8 * SIZE + bgtz I, .L281 + daddiu CO2, CO2, 8 * SIZE + + + .align 4 +.L24: + andi I, M, 4 # MR=4 + blez I, .L22 + NOP + + .align 4 +.L241: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + LD B2, 1 * SIZE(BO) + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L242 + MOV C44, C11 + + + .align 4 +.L2410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD B4, 3 * SIZE(BO) + + MADD C41, C41, A4, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 4 * SIZE + + MADD C12, C12, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C22, C22, A2, B2 + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD B2, 1 * SIZE(BO) + + MADD C41, C41, A8, B3 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A5, B4 + LD A3, 2 * SIZE(AO) + + MADD C22, C22, A6, B4 + LD A4, 3 * SIZE(AO) + + MADD C32, C32, A7, B4 + bgtz L, .L2410 + MADD C42, C42, A8, B4 + + .align 4 +.L242: + andi L, K, 1 + blez L, .L240 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + MADD C32, C32, A3, B2 + MADD C42, C42, A4, B2 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L240: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + LD A3, 2 * SIZE(CO1) + LD A4, 3 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD A3, A3, C31, ALPHA + LD B3, 2 * SIZE(CO2) + + MADD A4, A4, C41, ALPHA + LD B4, 3 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + MADD B3, B3, C32, ALPHA + ST A3, 2 * SIZE(CO1) + + MADD B4, B4, C42, ALPHA + ST A4, 3 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + ST B3, 2 * SIZE(CO2) + ST B4, 3 * SIZE(CO2) + + daddiu CO1, CO1, 4 * SIZE + daddiu CO2, CO2, 4 * SIZE + + .align 4 +.L22: + andi I, M, 2 + blez I, .L21 + NOP + + .align 4 +.L221: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L222 + MOV C44, C11 + + + .align 4 +.L2210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A3, 2 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 2 * SIZE(BO) + + MADD C12, C12, A1, B2 + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C22, C22, A2, B2 + LD B4, 3 * SIZE(BO) + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A4, B3 + LD B1, 0 * SIZE(BO) + + MADD C12, C12, A3, B4 + LD B2, 1 * SIZE(BO) + + MADD C22, C22, A4, B4 + bgtz L, .L2210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L222: + andi L, K, 1 + blez L, .L220 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C12, C12, A1, B2 + MADD C22, C22, A2, B2 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L220: # Write Back + LD A1, 0 * SIZE(CO1) + LD A2, 1 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD A2, A2, C21, ALPHA + LD B2, 1 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + MADD B2, B2, C22, ALPHA + ST A2, 1 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + ST B2, 1 * SIZE(CO2) + + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + + + .align 4 +.L21: + andi I, M, 1 + blez I, .L20 + NOP + + .align 4 +.L211: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + LD B2, 1 * SIZE(BO) + + MOV C43, C11 + blez L, .L212 + MOV C44, C11 + + + .align 4 +.L2110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A2, 1 * SIZE(AO) + + MADD C12, C12, A1, B2 + LD B3, 2 * SIZE(BO) + + LD B4, 3 * SIZE(BO) + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 4 * SIZE + + MADD C11, C11, A2, B3 + LD A1, 0 * SIZE(AO) + + MADD C12, C12, A2, B4 + LD B1, 0 * SIZE(BO) + + bgtz L, .L2110 + LD B2, 1 * SIZE(BO) + + + .align 4 +.L212: + andi L, K, 1 + blez L, .L210 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C12, C12, A1, B2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 2 * SIZE + + + .align 4 +.L210: # Write Back + LD A1, 0 * SIZE(CO1) + + MADD A1, A1, C11, ALPHA + LD B1, 0 * SIZE(CO2) + + MADD B1, B1, C12, ALPHA + ST A1, 0 * SIZE(CO1) + + ST B1, 0 * SIZE(CO2) + + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + + + .align 4 +.L20: + move B, BO + NOP + .align 4 @@ -588,6 +6026,429 @@ blez J, .L999 NOP +.L18: + dsra I, M, 3 # MR=8 + move AO, A # Reset A + blez I, .L14 + NOP + + + .align 4 +.L181: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + LD A1, 0 * SIZE(AO) + + MOV C12, C11 + LD A2, 1 * SIZE(AO) + + MOV C21, C11 + LD A3, 2 * SIZE(AO) + + MOV C22, C11 + LD A4, 3 * SIZE(AO) + + MOV C31, C11 + LD A5, 4 * SIZE(AO) + + MOV C32, C11 + LD A6, 5 * SIZE(AO) + + MOV C41, C11 + LD B1, 0 * SIZE(BO) + + MOV C42, C11 + LD A7, 6 * SIZE(AO) + + MOV C13, C11 + LD A8, 7 * SIZE(AO) + + MOV C14, C11 + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L182 + MOV C44, C11 + + + .align 4 +.L1810: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B5, 8 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B6, 9 * SIZE(AO) + + MADD C31, C31, A3, B1 + LD B7, 10 * SIZE(AO) + + MADD C41, C41, A4, B1 + LD B8, 11 * SIZE(AO) + + MADD C13, C13, A5, B1 + LD B2, 1 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + + MADD C23, C23, A6, B1 + LD A1, 12 * SIZE(AO) + + MADD C33, C33, A7, B1 + LD A2, 13 * SIZE(AO) + + MADD C43, C43, A8, B1 + LD A3, 14 * SIZE(AO) + + LD A4, 15 * SIZE(AO) + daddiu AO, AO, 16 * SIZE + + MADD C11, C11, B5, B2 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, B6, B2 + LD A6, 5 * SIZE(AO) + + MADD C13, C13, A1, B2 + LD A7, 6 * SIZE(AO) + + MADD C23, C23, A2, B2 + LD A8, 7 * SIZE(AO) + + MADD C33, C33, A3, B2 + LD B1, 0 * SIZE(BO) + + MADD C43, C43, A4, B2 + LD A1, 0 * SIZE(AO) + + MADD C31, C31, B7, B2 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, B8, B2 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1810 + LD A4, 3 * SIZE(AO) + + .align 4 +.L182: + andi L, K, 1 + blez L, .L180 + LD ALPHA, 152($fp) + + MADD C13, C13, A5, B1 + MADD C23, C23, A6, B1 + MADD C33, C33, A7, B1 + MADD C43, C43, A8, B1 + daddiu AO, AO, 8 * SIZE + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L180: # Write Back + daddiu I, I, -1 + + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + LD A5, 4 * SIZE(C) + LD A6, 5 * SIZE(C) + LD A7, 6 * SIZE(C) + LD A8, 7 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + MADD A5, A5, C13, ALPHA + MADD A6, A6, C23, ALPHA + MADD A7, A7, C33, ALPHA + MADD A8, A8, C43, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + ST A5, 4 * SIZE(C) + ST A6, 5 * SIZE(C) + ST A7, 6 * SIZE(C) + ST A8, 7 * SIZE(C) + + daddiu C, C, 8 * SIZE + bgtz I, .L181 + NOP + + + .align 4 +.L14: + andi I, M, 4 # MR=4 + blez I, .L12 + NOP + + .align 4 +.L141: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD A3, 2 * SIZE(AO) + + MOV C41, C11 + MOV C42, C11 + LD A4, 3 * SIZE(AO) + + MOV C13, C11 + MOV C14, C11 + LD B1, 0 * SIZE(BO) + + MOV C23, C11 + MOV C24, C11 + + MOV C33, C11 + MOV C34, C11 + + MOV C43, C11 + blez L, .L142 + MOV C44, C11 + + + .align 4 +.L1410: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD A5, 4 * SIZE(AO) + + MADD C21, C21, A2, B1 + LD B3, 1 * SIZE(BO) + + MADD C31, C31, A3, B1 + LD A6, 5 * SIZE(AO) + daddiu BO, BO, 2 * SIZE + + MADD C41, C41, A4, B1 + LD A7, 6 * SIZE(AO) + + LD A8, 7 * SIZE(AO) + daddiu AO, AO, 8 * SIZE + + + MADD C11, C11, A5, B3 + LD A1, 0 * SIZE(AO) + + MADD C21, C21, A6, B3 + LD B1, 0 * SIZE(BO) + + MADD C31, C31, A7, B3 + LD A2, 1 * SIZE(AO) + + MADD C41, C41, A8, B3 + LD A3, 2 * SIZE(AO) + + bgtz L, .L1410 + LD A4, 3 * SIZE(AO) + + .align 4 +.L142: + andi L, K, 1 + blez L, .L140 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + MADD C31, C31, A3, B1 + MADD C41, C41, A4, B1 + daddiu AO, AO, 4 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L140: # Write Back + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + LD A3, 2 * SIZE(C) + LD A4, 3 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + MADD A3, A3, C31, ALPHA + MADD A4, A4, C41, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + ST A3, 2 * SIZE(C) + ST A4, 3 * SIZE(C) + daddiu C, C, 4 * SIZE + + .align 4 +.L12: + andi I, M, 2 + blez I, .L11 + NOP + + .align 4 +.L121: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD A2, 1 * SIZE(AO) + + MOV C31, C11 + MOV C32, C11 + LD B1, 0 * SIZE(BO) + + MOV C41, C11 + MOV C42, C11 + + MOV C43, C11 + blez L, .L122 + MOV C44, C11 + + + .align 4 +.L1210: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + LD B3, 1 * SIZE(BO) + + MADD C21, C21, A2, B1 + daddiu BO, BO, 2 * SIZE + + LD A3, 2 * SIZE(AO) + LD A4, 3 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD C11, C11, A3, B3 + LD B1, 0 * SIZE(BO) + + MADD C21, C21, A4, B3 + LD A1, 0 * SIZE(AO) + bgtz L, .L1210 + LD A2, 1 * SIZE(AO) + + + .align 4 +.L122: + andi L, K, 1 + blez L, .L120 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + MADD C21, C21, A2, B1 + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L120: # Write Back + LD A1, 0 * SIZE(C) + LD A2, 1 * SIZE(C) + + MADD A1, A1, C11, ALPHA + MADD A2, A2, C21, ALPHA + + ST A1, 0 * SIZE(C) + ST A2, 1 * SIZE(C) + + daddiu C, C, 2 * SIZE + + + .align 4 +.L11: + andi I, M, 1 + blez I, .L10 + NOP + + .align 4 +.L111: + move BO, B # Reset B + dsra L, K, 1 # UnRoll K=4 + + MTC $0, C11 # CLEAR REAULTS REGISTERS + MOV C12, C11 + LD A1, 0 * SIZE(AO) + + MOV C21, C11 + MOV C22, C11 + LD B1, 0 * SIZE(BO) + + MOV C31, C11 + blez L, .L112 + MOV C32, C11 + + + + .align 4 +.L1110: + daddiu L, L, -1 + MADD C11, C11, A1, B1 + + LD A2, 1 * SIZE(AO) + LD B2, 1 * SIZE(BO) + + daddiu AO, AO, 2 * SIZE + daddiu BO, BO, 2 * SIZE + + MADD C11, C11, A2, B2 + LD A1, 0 * SIZE(AO) + LD B1, 0 * SIZE(BO) + + bgtz L, .L1110 + NOP + + + .align 4 +.L112: + andi L, K, 1 + blez L, .L110 + LD ALPHA, 152($fp) + + MADD C11, C11, A1, B1 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 1 * SIZE + + + .align 4 +.L110: # Write Back + LD A1, 0 * SIZE(C) + + MADD A1, A1, C11, ALPHA + + ST A1, 0 * SIZE(C) + + daddiu C, C, 1 * SIZE + + + .align 4 +.L10: + move B, BO + NOP .L999: @@ -627,6 +6488,6 @@ EPILOGUE # .set macro # .set reorder -# .end REALNAME -# .size REALNAME, .-REALNAME -#.ident "GCC: (Debian 4.4.6-6) 4.4.6" +# .end gemm +# .size gemm, .-gemm +# .ident "GCC: (Debian 4.4.6-6) 4.4.6"