/*
 * OpenBLAS/kernel/riscv64/sgemm_kernel_16x4_c910v.c
 * Add the support for RISC-V Vector. (damonyu, 2020-10-15)
 * Change-Id: Iae7800a32f5af3903c330882cdf6f292d885f266
 */

#include "common.h"
#include <riscv-vector.h>
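/*
 * SGEMM micro-kernel targeting the T-Head C910 (RVV 0.7.1; with VLEN=128,
 * e32/m1 holds 4 floats per vector register, hence the vle.v/vse.v
 * spellings rather than the RVV 1.0 vle32.v/vse32.v).
 *
 * The unrolled k loop is software pipelined with four macro stages:
 *   KERNEL*_I  - prologue: first loads and broadcasts plus the first FMAs
 *   KERNEL*_M1 - steady state: FMAs on v0-v3 (A) and v8-v11 (B broadcasts)
 *                while loading the next iteration into v4-v7 / v12-v15
 *   KERNEL*_M2 - steady state with the two register sets swapped
 *   KERNEL*_E  - epilogue: drains the last loaded register set
 */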
#define KERNEL16x4_I \
"addi t1, %[PB], 1*4 \n\t"\
"addi t2, %[PB], 2*4 \n\t"\
"addi t3, %[PB], 3*4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"flw ft1, (t1) \n\t"\
"flw ft2, (t2) \n\t"\
"flw ft3, (t3) \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi t4, %[PA], 4*4 \n\t"\
"addi t5, %[PA], 8*4 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"addi t6, %[PA], 12*4 \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmv.v.f v9, ft1 \n\t"\
"vle.v v2, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vle.v v3, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmv.v.f v10, ft2 \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmv.v.f v11, ft3 \n\t"\
"vfmacc.vv v16, v8, v0 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vle.v v6, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v18, v8, v2 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vle.v v7, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmacc.vv v19, v8, v3 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft5, (t1) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"flw ft6, (t2) \n\t"\
"vfmacc.vv v22, v9, v2 \n\t"\
"flw ft7, (t3) \n\t"\
"vfmacc.vv v23, v9, v3 \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"vfmacc.vv v24, v10, v0 \n\t"\
"vfmv.v.f v13, ft5 \n\t"\
"vfmacc.vv v25, v10, v1 \n\t"\
"vfmv.v.f v14, ft6 \n\t"\
"vfmacc.vv v26, v10, v2 \n\t"\
"vfmv.v.f v15, ft7 \n\t"\
"vfmacc.vv v27, v10, v3 \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vfmacc.vv v28, v11, v0 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v29, v11, v1 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmacc.vv v30, v11, v2 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v31, v11, v3 \n\t"
#define KERNEL16x4_M1 \
"vfmacc.vv v16, v8, v0 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v18, v8, v2 \n\t"\
"vle.v v6, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v19, v8, v3 \n\t"\
"vle.v v7, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"flw ft5, (t1) \n\t"\
"vfmacc.vv v22, v9, v2 \n\t"\
"flw ft6, (t2) \n\t"\
"vfmacc.vv v23, v9, v3 \n\t"\
"flw ft7, (t3) \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vfmacc.vv v24, v10, v0 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v25, v10, v1 \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"vfmacc.vv v26, v10, v2 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmacc.vv v27, v10, v3 \n\t"\
"vfmv.v.f v13, ft5 \n\t"\
"vfmacc.vv v28, v11, v0 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v29, v11, v1 \n\t"\
"vfmv.v.f v14, ft6 \n\t"\
"vfmacc.vv v30, v11, v2 \n\t"\
"vfmacc.vv v31, v11, v3 \n\t"\
"vfmv.v.f v15, ft7 \n\t"
#define KERNEL16x4_M2 \
"vfmacc.vv v16, v12, v4 \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v18, v12, v6 \n\t"\
"vle.v v2, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v19, v12, v7 \n\t"\
"vle.v v3, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"flw ft1, (t1) \n\t"\
"vfmacc.vv v22, v13, v6 \n\t"\
"flw ft2, (t2) \n\t"\
"vfmacc.vv v23, v13, v7 \n\t"\
"flw ft3, (t3) \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vfmacc.vv v24, v14, v4 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v25, v14, v5 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"vfmacc.vv v26, v14, v6 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmacc.vv v27, v14, v7 \n\t"\
"vfmv.v.f v9, ft1 \n\t"\
"vfmacc.vv v28, v15, v4 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v29, v15, v5 \n\t"\
"vfmv.v.f v10, ft2 \n\t"\
"vfmacc.vv v30, v15, v6 \n\t"\
"vfmacc.vv v31, v15, v7 \n\t"\
"vfmv.v.f v11, ft3 \n\t"
#define KERNEL16x4_E \
"vfmacc.vv v16, v12, v4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vfmacc.vv v18, v12, v6 \n\t"\
"vfmacc.vv v19, v12, v7 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"vfmacc.vv v22, v13, v6 \n\t"\
"vfmacc.vv v23, v13, v7 \n\t"\
"vfmacc.vv v24, v14, v4 \n\t"\
"vfmacc.vv v25, v14, v5 \n\t"\
"vfmacc.vv v26, v14, v6 \n\t"\
"vfmacc.vv v27, v14, v7 \n\t"\
"vfmacc.vv v28, v15, v4 \n\t"\
"vfmacc.vv v29, v15, v5 \n\t"\
"vfmacc.vv v30, v15, v6 \n\t"\
"vfmacc.vv v31, v15, v7 \n\t"
#define KERNEL8x4_I \
"addi t1, %[PB], 1*4 \n\t"\
"addi t2, %[PB], 2*4 \n\t"\
"addi t3, %[PB], 3*4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"flw ft1, (t1) \n\t"\
"flw ft2, (t2) \n\t"\
"flw ft3, (t3) \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi t4, %[PA], 4*4 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"addi %[PA], %[PA], 8*4 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 8*4 \n\t"\
"vfmv.v.f v9, ft1 \n\t"\
"vfmv.v.f v10, ft2 \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 8*4 \n\t"\
"vfmv.v.f v11, ft3 \n\t"\
"vfmacc.vv v16, v8, v0 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 8*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft5, (t1) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"flw ft6, (t2) \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"flw ft7, (t3) \n\t"\
"vfmacc.vv v24, v10, v0 \n\t"\
"vfmv.v.f v13, ft5 \n\t"\
"vfmacc.vv v25, v10, v1 \n\t"\
"vfmv.v.f v14, ft6 \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"vfmv.v.f v15, ft7 \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v28, v11, v0 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmacc.vv v29, v11, v1 \n\t"\
"addi t3, t3, 4*4 \n\t"
#define KERNEL8x4_M1 \
"vfmacc.vv v16, v8, v0 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 8*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 8*4 \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"flw ft5, (t1) \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"flw ft6, (t2) \n\t"\
"vfmacc.vv v24, v10, v0 \n\t"\
"flw ft7, (t3) \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v25, v10, v1 \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmv.v.f v13, ft5 \n\t"\
"vfmacc.vv v28, v11, v0 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v29, v11, v1 \n\t"\
"vfmv.v.f v14, ft6 \n\t"\
"vfmv.v.f v15, ft7 \n\t"
#define KERNEL8x4_M2 \
"vfmacc.vv v16, v12, v4 \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi %[PA], %[PA], 8*4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 8*4 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"flw ft1, (t1) \n\t"\
"addi %[PB], %[PB], 4*4 \n\t"\
"flw ft2, (t2) \n\t"\
"vfmacc.vv v24, v14, v4 \n\t"\
"flw ft3, (t3) \n\t"\
"addi t1, t1, 4*4 \n\t"\
"vfmacc.vv v25, v14, v5 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"addi t2, t2, 4*4 \n\t"\
"vfmv.v.f v9, ft1 \n\t"\
"vfmacc.vv v28, v15, v4 \n\t"\
"addi t3, t3, 4*4 \n\t"\
"vfmacc.vv v29, v15, v5 \n\t"\
"vfmv.v.f v10, ft2 \n\t"\
"vfmv.v.f v11, ft3 \n\t"
#define KERNEL8x4_E \
"vfmacc.vv v16, v12, v4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"vfmacc.vv v24, v14, v4 \n\t"\
"vfmacc.vv v25, v14, v5 \n\t"\
"vfmacc.vv v28, v15, v4 \n\t"\
"vfmacc.vv v29, v15, v5 \n\t"
#define KERNEL16x2_I \
"addi t1, %[PB], 1*4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"flw ft1, (t1) \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi t4, %[PA], 4*4 \n\t"\
"addi t5, %[PA], 8*4 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"addi t6, %[PA], 12*4 \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmv.v.f v9, ft1 \n\t"\
"vle.v v2, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vle.v v3, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"addi %[PB], %[PB], 2*4 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmacc.vv v16, v8, v0 \n\t"\
"addi t1, t1, 2*4 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"vle.v v6, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v18, v8, v2 \n\t"\
"vle.v v7, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmacc.vv v19, v8, v3 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft5, (t1) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"addi %[PB], %[PB], 2*4 \n\t"\
"vfmacc.vv v22, v9, v2 \n\t"\
"addi t1, t1, 2*4 \n\t"\
"vfmacc.vv v23, v9, v3 \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"vfmv.v.f v13, ft5 \n\t"
#define KERNEL16x2_M1 \
"vfmacc.vv v16, v8, v0 \n\t"\
"vle.v v4, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmacc.vv v17, v8, v1 \n\t"\
"vle.v v5, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v18, v8, v2 \n\t"\
"vle.v v6, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v19, v8, v3 \n\t"\
"vle.v v7, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"flw ft4, (%[PB]) \n\t"\
"vfmacc.vv v20, v9, v0 \n\t"\
"flw ft5, (t1) \n\t"\
"vfmacc.vv v21, v9, v1 \n\t"\
"vfmv.v.f v12, ft4 \n\t"\
"vfmacc.vv v22, v9, v2 \n\t"\
"addi t1, t1, 2*4 \n\t"\
"vfmacc.vv v23, v9, v3 \n\t"\
"addi %[PB], %[PB], 2*4 \n\t"\
"vfmv.v.f v13, ft5 \n\t"
#define KERNEL16x2_M2 \
"vfmacc.vv v16, v12, v4 \n\t"\
"vle.v v0, (%[PA]) \n\t"\
"addi %[PA], %[PA], 16*4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vle.v v1, (t4) \n\t"\
"addi t4, t4, 16*4 \n\t"\
"vfmacc.vv v18, v12, v6 \n\t"\
"vle.v v2, (t5) \n\t"\
"addi t5, t5, 16*4 \n\t"\
"vfmacc.vv v19, v12, v7 \n\t"\
"vle.v v3, (t6) \n\t"\
"addi t6, t6, 16*4 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"flw ft0, (%[PB]) \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"flw ft1, (t1) \n\t"\
"vfmacc.vv v22, v13, v6 \n\t"\
"vfmv.v.f v8, ft0 \n\t"\
"vfmacc.vv v23, v13, v7 \n\t"\
"addi %[PB], %[PB], 2*4 \n\t"\
"addi t1, t1, 2*4 \n\t"\
"vfmv.v.f v9, ft1 \n\t"
#define KERNEL16x2_E \
"vfmacc.vv v16, v12, v4 \n\t"\
"vfmacc.vv v17, v12, v5 \n\t"\
"vfmacc.vv v18, v12, v6 \n\t"\
"vfmacc.vv v19, v12, v7 \n\t"\
"vfmacc.vv v20, v13, v4 \n\t"\
"vfmacc.vv v21, v13, v5 \n\t"\
"vfmacc.vv v22, v13, v6 \n\t"\
"vfmacc.vv v23, v13, v7 \n\t"
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3;
FLOAT *ptrba,*ptrbb;
FLOAT loadb0,loadb1,loadb2,loadb3;
FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
FLOAT res0,res1,res2,res3;
FLOAT res4,res5,res6,res7;
FLOAT res8,res9,res10,res11;
FLOAT res12,res13,res14,res15;
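// process B/C in panels of four columns; 16-row tiles run the pipelined asm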
for (j=0; j<bn/4; j+=1){
C0 = C;
C1 = C0+ldc;
C2 = C1+ldc;
C3 = C2+ldc;
ptrba = ba;
for(i=0; i<bm/16; i+=1){
ptrbb = bb;
//t0: k-loop counter
//ft0-ft7 hold B scalars, v8-v15 their broadcasts; t1-t3 point at B columns 1-3
//v0-v7 hold A vectors; t4-t6 point at elements 4, 8, 12 of the 16-wide A panel
//v16-v31: C accumulators
asm volatile(
"vsetvli zero, zero, e32,m1 \n\t"
"fmv.w.x ft11, zero \n\t"
"mv t0, %[BK] \n\t"
"vfmv.v.f v16, ft11 \n\t"
"vfmv.v.f v17, ft11 \n\t"
"vfmv.v.f v18, ft11 \n\t"
"vfmv.v.f v19, ft11 \n\t"
"vfmv.v.f v20, ft11 \n\t"
"vfmv.v.f v21, ft11 \n\t"
"vfmv.v.f v22, ft11 \n\t"
"vfmv.v.f v23, ft11 \n\t"
"vfmv.v.f v24, ft11 \n\t"
"vfmv.v.f v25, ft11 \n\t"
"vfmv.v.f v26, ft11 \n\t"
"vfmv.v.f v27, ft11 \n\t"
"vfmv.v.f v28, ft11 \n\t"
"vfmv.v.f v29, ft11 \n\t"
"vfmv.v.f v30, ft11 \n\t"
"vfmv.v.f v31, ft11 \n\t"
//unroll the k loop by 8
"srli t0, %[BK], 3 \n\t"
"blez t0, M16x4_TAIL \n\t"
//pipeline prologue: first four unrolled iterations
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_M2
"addi t0, t0, -1 \n\t"
"blez t0, M16x4_MAINLOOP_TAIL \n\t"
".align 4 \n\t"
"M16x4_MAINLOOP: \n\t"
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_M2
"addi t0, t0, -1 \n\t"
"bgtz t0, M16x4_MAINLOOP \n\t"
"M16x4_MAINLOOP_TAIL: \n\t"
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
//tail: bk%8 leftover k iterations, one at a time
"M16x4_TAIL: \n\t"
"andi t0, %[BK], 7 \n\t"
"blez t0, M16x4_SAVERESULT \n\t"
"addi t4, %[PA], 4*4 \n\t"
"addi t5, %[PA], 8*4 \n\t"
"addi t6, %[PA], 12*4 \n\t"
"addi t1, %[PB], 1*4 \n\t"
"addi t2, %[PB], 2*4 \n\t"
"addi t3, %[PB], 3*4 \n\t"
".align 4 \n\t"
"M16x4_TAILLOOP: \n\t"
"flw ft0, (%[PB]) \n\t"
"addi %[PB], %[PB], 4*4 \n\t"
"vle.v v0, (%[PA]) \n\t"
"add %[PA], %[PA], 16*4 \n\t"
"vle.v v1, (t4) \n\t"
"addi t4, t4, 16*4 \n\t"
"vfmv.v.f v8, ft0 \n\t"
"flw ft1, (t1) \n\t"
"addi t1, t1, 4*4 \n\t"
"vle.v v2, (t5) \n\t"
"addi t5, t5, 16*4 \n\t"
"vle.v v3, (t6) \n\t"
"addi t6, t6, 16*4 \n\t"
"vfmacc.vv v16, v8, v0 \n\t"
"flw ft2, (t2) \n\t"
"addi t2, t2, 4*4 \n\t"
"vfmacc.vv v17, v8, v1 \n\t"
"vfmacc.vv v18, v8, v2 \n\t"
"vfmv.v.f v9, ft1 \n\t"
"vfmacc.vv v19, v8, v3 \n\t"
"vfmacc.vv v20, v9, v0 \n\t"
"flw ft3, (t3) \n\t"
"addi t3, t3, 4*4 \n\t"
"vfmacc.vv v21, v9, v1 \n\t"
"vfmacc.vv v22, v9, v2 \n\t"
"vfmv.v.f v10, ft2 \n\t"
"vfmacc.vv v23, v9, v3 \n\t"
"vfmv.v.f v11, ft3 \n\t"
"vfmacc.vv v24, v10, v0 \n\t"
"vfmacc.vv v25, v10, v1 \n\t"
"vfmacc.vv v26, v10, v2 \n\t"
"vfmacc.vv v27, v10, v3 \n\t"
"vfmacc.vv v28, v11, v0 \n\t"
"vfmacc.vv v29, v11, v1 \n\t"
"vfmacc.vv v30, v11, v2 \n\t"
"vfmacc.vv v31, v11, v3 \n\t"
"addi t0, t0, -1 \n\t"
"bgtz t0, M16x4_TAILLOOP \n\t"
//store results: reload C, fuse in alpha*accumulator, write back
"M16x4_SAVERESULT: \n\t"
//broadcast alpha into v8
"vfmv.v.f v8, %[ALPHA] \n\t"
"vle.v v0, (%[C0]) \n\t"
"addi t4, %[C0], 4*4 \n\t"
"vle.v v1, (%[C1]) \n\t"
"addi t5, %[C1], 4*4 \n\t"
"vle.v v2, (%[C2]) \n\t"
"addi t6, %[C2], 4*4 \n\t"
"vle.v v3, (%[C3]) \n\t"
"addi t3, %[C3], 4*4 \n\t"
//multiply accumulators by alpha and add into C
"vfmacc.vv v0, v8, v16 \n\t"
"vle.v v4, (t4) \n\t"
"vfmacc.vv v1, v8, v20 \n\t"
"vle.v v5, (t5) \n\t"
"vfmacc.vv v2, v8, v24 \n\t"
"vle.v v6, (t6) \n\t"
"vfmacc.vv v3, v8, v28 \n\t"
"vle.v v7, (t3) \n\t"
"vfmacc.vv v4, v8, v17 \n\t"
"vse.v v0, (%[C0]) \n\t"
"add %[C0], %[C0], 8*4 \n\t"
"vfmacc.vv v5, v8, v21 \n\t"
"vse.v v1, (%[C1]) \n\t"
"add %[C1], %[C1], 8*4 \n\t"
"vfmacc.vv v6, v8, v25 \n\t"
"vse.v v2, (%[C2]) \n\t"
"add %[C2], %[C2], 8*4 \n\t"
"vfmacc.vv v7, v8, v29 \n\t"
"vse.v v3, (%[C3]) \n\t"
"add %[C3], %[C3], 8*4 \n\t"
"vle.v v0, (%[C0]) \n\t"
"vse.v v4, (t4) \n\t"
"add t4, t4, 8*4 \n\t"
"vle.v v1, (%[C1]) \n\t"
"vse.v v5, (t5) \n\t"
"add t5, t5, 8*4 \n\t"
"vle.v v2, (%[C2]) \n\t"
"vse.v v6, (t6) \n\t"
"add t6, t6, 8*4 \n\t"
"vle.v v3, (%[C3]) \n\t"
"vse.v v7, (t3) \n\t"
"add t3, t3, 8*4 \n\t"
"vfmacc.vv v0, v8, v18 \n\t"
"vle.v v4, (t4) \n\t"
"vfmacc.vv v1, v8, v22 \n\t"
"vle.v v5, (t5) \n\t"
"vfmacc.vv v2, v8, v26 \n\t"
"vle.v v6, (t6) \n\t"
"vfmacc.vv v3, v8, v30 \n\t"
"vle.v v7, (t3) \n\t"
"vfmacc.vv v4, v8, v19 \n\t"
"vse.v v0, (%[C0]) \n\t"
"add %[C0], %[C0], 8*4 \n\t"
"vfmacc.vv v5, v8, v23 \n\t"
"vse.v v1, (%[C1]) \n\t"
"add %[C1], %[C1], 8*4 \n\t"
"vfmacc.vv v6, v8, v27 \n\t"
"vse.v v2, (%[C2]) \n\t"
"add %[C2], %[C2], 8*4 \n\t"
"vfmacc.vv v7, v8, v31 \n\t"
"vse.v v3, (%[C3]) \n\t"
"add %[C3], %[C3], 8*4 \n\t"
"vse.v v4, (t4) \n\t"
"vse.v v5, (t5) \n\t"
"vse.v v6, (t6) \n\t"
"vse.v v7, (t3) \n\t"
"M16x4_END: \n\t"
:[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
[PA]"+r"(ptrba), [PB]"+r"(ptrbb)
:[ALPHA]"f"(alpha), [BK]"r"(bk)
:"cc", "t0", "t4","t5","t6","t3","t1","t2",
"ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
"v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
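// 8-row tail (bm & 8): same pipelined scheme with the 8x4 kernel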
if(bm&8){
ptrbb = bb;
//t0: k-loop counter
//ft0-ft7 hold B scalars, v8-v15 their broadcasts; t1-t3 point at B columns 1-3
//v0-v1,v4-v5 hold A vectors; t4 points at element 4 of the 8-wide A panel
//v16-v17,v20-v21,v24-v25,v28-v29: C accumulators
asm volatile(
"vsetvli zero, zero, e32,m1 \n\t"
"fmv.w.x ft11, zero \n\t"
"mv t0, %[BK] \n\t"
"vfmv.v.f v16, ft11 \n\t"
"vfmv.v.f v17, ft11 \n\t"
"vfmv.v.f v20, ft11 \n\t"
"vfmv.v.f v21, ft11 \n\t"
"vfmv.v.f v24, ft11 \n\t"
"vfmv.v.f v25, ft11 \n\t"
"vfmv.v.f v28, ft11 \n\t"
"vfmv.v.f v29, ft11 \n\t"
//unroll the k loop by 8
"srli t0, %[BK], 3 \n\t"
"blez t0, M8x4_TAIL \n\t"
//pipeline prologue: first four unrolled iterations
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
"addi t0, t0, -1 \n\t"
"blez t0, M8x4_MAINLOOP_TAIL \n\t"
".align 4 \n\t"
"M8x4_MAINLOOP: \n\t"
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
"addi t0, t0, -1 \n\t"
"bgtz t0, M8x4_MAINLOOP \n\t"
"M8x4_MAINLOOP_TAIL: \n\t"
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
//tail: bk%8 leftover k iterations, one at a time
"M8x4_TAIL: \n\t"
"andi t0, %[BK], 7 \n\t"
"blez t0, M8x4_SAVERESULT \n\t"
"addi t4, %[PA], 4*4 \n\t"
"addi t1, %[PB], 1*4 \n\t"
"addi t2, %[PB], 2*4 \n\t"
"addi t3, %[PB], 3*4 \n\t"
".align 4 \n\t"
"M8x4_TAILLOOP: \n\t"
"flw ft0, (%[PB]) \n\t"
"addi %[PB], %[PB], 4*4 \n\t"
"vle.v v0, (%[PA]) \n\t"
"add %[PA], %[PA], 8*4 \n\t"
"vle.v v1, (t4) \n\t"
"addi t4, t4, 8*4 \n\t"
"vfmv.v.f v8, ft0 \n\t"
"flw ft1, (t1) \n\t"
"addi t1, t1, 4*4 \n\t"
"vfmacc.vv v16, v8, v0 \n\t"
"flw ft2, (t2) \n\t"
"addi t2, t2, 4*4 \n\t"
"vfmacc.vv v17, v8, v1 \n\t"
"vfmv.v.f v9, ft1 \n\t"
"vfmacc.vv v20, v9, v0 \n\t"
"flw ft3, (t3) \n\t"
"addi t3, t3, 4*4 \n\t"
"vfmacc.vv v21, v9, v1 \n\t"
"vfmv.v.f v10, ft2 \n\t"
"vfmv.v.f v11, ft3 \n\t"
"vfmacc.vv v24, v10, v0 \n\t"
"vfmacc.vv v25, v10, v1 \n\t"
"vfmacc.vv v28, v11, v0 \n\t"
"vfmacc.vv v29, v11, v1 \n\t"
"addi t0, t0, -1 \n\t"
"bgtz t0, M8x4_TAILLOOP \n\t"
//store results: reload C, fuse in alpha*accumulator, write back
"M8x4_SAVERESULT: \n\t"
//broadcast alpha into v8
"vfmv.v.f v8, %[ALPHA] \n\t"
"vle.v v0, (%[C0]) \n\t"
"addi t4, %[C0], 4*4 \n\t"
"vle.v v1, (%[C1]) \n\t"
"addi t5, %[C1], 4*4 \n\t"
"vle.v v2, (%[C2]) \n\t"
"addi t6, %[C2], 4*4 \n\t"
"vle.v v3, (%[C3]) \n\t"
"addi t3, %[C3], 4*4 \n\t"
//multiply accumulators by alpha and add into C
"vfmacc.vv v0, v8, v16 \n\t"
"vle.v v4, (t4) \n\t"
"vfmacc.vv v1, v8, v20 \n\t"
"vle.v v5, (t5) \n\t"
"vfmacc.vv v2, v8, v24 \n\t"
"vle.v v6, (t6) \n\t"
"vfmacc.vv v3, v8, v28 \n\t"
"vle.v v7, (t3) \n\t"
"vfmacc.vv v4, v8, v17 \n\t"
"vse.v v0, (%[C0]) \n\t"
"add %[C0], %[C0], 8*4 \n\t"
"vfmacc.vv v5, v8, v21 \n\t"
"vse.v v1, (%[C1]) \n\t"
"add %[C1], %[C1], 8*4 \n\t"
"vfmacc.vv v6, v8, v25 \n\t"
"vse.v v2, (%[C2]) \n\t"
"add %[C2], %[C2], 8*4 \n\t"
"vfmacc.vv v7, v8, v29 \n\t"
"vse.v v3, (%[C3]) \n\t"
"add %[C3], %[C3], 8*4 \n\t"
"vse.v v4, (t4) \n\t"
"vse.v v5, (t5) \n\t"
"vse.v v6, (t6) \n\t"
"vse.v v7, (t3) \n\t"
"M8x4_END: \n\t"
:[C0]"+r"(C0),[C1]"+r"(C1),[C2]"+r"(C2),[C3]"+r"(C3),
[PA]"+r"(ptrba), [PB]"+r"(ptrbb)
:[ALPHA]"f"(alpha), [BK]"r"(bk)
:"cc", "t0", "t4","t5","t6","t3","t1","t2",
"ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
"v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
"v16", "v17", "v20", "v21",
"v24", "v25", "v28", "v29");
}
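// narrower row tails (bm & 4/2/1) are plain C: accumulate, scale by alpha, add to C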
if(bm&4){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
res8 = 0;
res9 = 0;
res10 = 0;
res11 = 0;
res12 = 0;
res13 = 0;
res14 = 0;
res15 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
load1 = ptrba[1];
load2 = ptrba[2];
load3 = ptrba[3];
res0 = res0 + load0 * loadb0;
res1 = res1 + load1 * loadb0;
res2 = res2 + load2 * loadb0;
res3 = res3 + load3 * loadb0;
res4 = res4 + load0 * loadb1;
res5 = res5 + load1 * loadb1;
res6 = res6 + load2 * loadb1;
res7 = res7 + load3 * loadb1;
loadb2 = ptrbb[2];
loadb3 = ptrbb[3];
res8 = res8 + load0 * loadb2;
res9 = res9 + load1 * loadb2;
res10 = res10 + load2 * loadb2;
res11 = res11 + load3 * loadb2;
res12 = res12 + load0 * loadb3;
res13 = res13 + load1 * loadb3;
res14 = res14 + load2 * loadb3;
res15 = res15 + load3 * loadb3;
ptrba += 4;
ptrbb += 4;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
res4 = res4 * alpha;
res5 = res5 * alpha;
res6 = res6 * alpha;
res7 = res7 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
res10 = res10 * alpha;
res11 = res11 * alpha;
res12 = res12 * alpha;
res13 = res13 * alpha;
res14 = res14 * alpha;
res15 = res15 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C1[0] += res4;
C1[1] += res5;
C1[2] += res6;
C1[3] += res7;
C2[0] += res8;
C2[1] += res9;
C2[2] += res10;
C2[3] += res11;
C3[0] += res12;
C3[1] += res13;
C3[2] += res14;
C3[3] += res15;
C0 += 4;
C1 += 4;
C2 += 4;
C3 += 4;
}
if(bm&2){
ptrbb = bb;
res0 = 0;
res1 = 0;
res4 = 0;
res5 = 0;
res8 = 0;
res9 = 0;
res12 = 0;
res13 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
load1 = ptrba[1];
res0 = res0 + load0 * loadb0;
res1 = res1 + load1 * loadb0;
res4 = res4 + load0 * loadb1;
res5 = res5 + load1 * loadb1;
loadb2 = ptrbb[2];
loadb3 = ptrbb[3];
res8 = res8 + load0 * loadb2;
res9 = res9 + load1 * loadb2;
res12 = res12 + load0 * loadb3;
res13 = res13 + load1 * loadb3;
ptrba += 2;
ptrbb += 4;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res4 = res4 * alpha;
res5 = res5 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
res12 = res12 * alpha;
res13 = res13 * alpha;
C0[0] += res0;
C0[1] += res1;
C1[0] += res4;
C1[1] += res5;
C2[0] += res8;
C2[1] += res9;
C3[0] += res12;
C3[1] += res13;
C0 += 2;
C1 += 2;
C2 += 2;
C3 += 2;
}
if(bm&1){
ptrbb = bb;
res0 = 0;
res4 = 0;
res8 = 0;
res12 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
res0 = res0 + load0 * loadb0;
res4 = res4 + load0 * loadb1;
loadb2 = ptrbb[2];
loadb3 = ptrbb[3];
res8 = res8 + load0 * loadb2;
res12 = res12 + load0 * loadb3;
ptrba += 1;
ptrbb += 4;
}
res0 = res0 * alpha;
res4 = res4 * alpha;
res8 = res8 * alpha;
res12 = res12 * alpha;
C0[0] += res0;
C1[0] += res4;
C2[0] += res8;
C3[0] += res12;
C0 += 1;
C1 += 1;
C2 += 1;
C3 += 1;
}
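// advance bb past the 4-wide B panel (4*bk floats) and C by four columns;
// the 2- and 1-wide panels below do the same with their own widths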
k = bk<<2;
bb = bb+k;
i = ldc<<2;
C = C+i;
}
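// 2-column panel (bn & 2): 16-row tiles use the 16x2 vector kernel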
if(bn&2){
C0 = C;
C1 = C0+ldc;
ptrba = ba;
for(i=0; i<bm/16; i+=1){
ptrbb = bb;
asm volatile(
"vsetvli zero, zero, e32,m1 \n\t"
"fmv.w.x ft11, zero \n\t"
"mv t0, %[BK] \n\t"
"vfmv.v.f v16, ft11 \n\t"
"vfmv.v.f v17, ft11 \n\t"
"vfmv.v.f v18, ft11 \n\t"
"vfmv.v.f v19, ft11 \n\t"
"vfmv.v.f v20, ft11 \n\t"
"vfmv.v.f v21, ft11 \n\t"
"vfmv.v.f v22, ft11 \n\t"
"vfmv.v.f v23, ft11 \n\t"
//unroll the k loop by 8
"srli t0, %[BK], 3 \n\t"
"blez t0, M16x2_TAIL \n\t"
//pipeline prologue: first four unrolled iterations
KERNEL16x2_I
KERNEL16x2_M2
KERNEL16x2_M1
KERNEL16x2_M2
"addi t0, t0, -1 \n\t"
"blez t0, M16x2_MAINLOOP_TAIL \n\t"
".align 4 \n\t"
"M16x2_MAINLOOP: \n\t"
KERNEL16x2_M1
KERNEL16x2_M2
KERNEL16x2_M1
KERNEL16x2_M2
KERNEL16x2_M1
KERNEL16x2_M2
KERNEL16x2_M1
KERNEL16x2_M2
"addi t0, t0, -1 \n\t"
"bgtz t0, M16x2_MAINLOOP \n\t"
"M16x2_MAINLOOP_TAIL: \n\t"
KERNEL16x2_M1
KERNEL16x2_M2
KERNEL16x2_M1
KERNEL16x2_E
//tail: bk%8 leftover k iterations, one at a time
"M16x2_TAIL: \n\t"
"andi t0, %[BK], 7 \n\t"
"blez t0, M16x2_SAVERESULT \n\t"
"addi t4, %[PA], 4*4 \n\t"
"addi t5, %[PA], 8*4 \n\t"
"addi t6, %[PA], 12*4 \n\t"
"addi t1, %[PB], 1*4 \n\t"
".align 4 \n\t"
"M16x2_TAILLOOP: \n\t"
"flw ft0, (%[PB]) \n\t"
"addi %[PB], %[PB], 2*4 \n\t"
"vle.v v0, (%[PA]) \n\t"
"add %[PA], %[PA], 16*4 \n\t"
"vle.v v1, (t4) \n\t"
"addi t4, t4, 16*4 \n\t"
"vfmv.v.f v8, ft0 \n\t"
"flw ft1, (t1) \n\t"
"addi t1, t1, 2*4 \n\t"
"vle.v v2, (t5) \n\t"
"addi t5, t5, 16*4 \n\t"
"vle.v v3, (t6) \n\t"
"addi t6, t6, 16*4 \n\t"
"vfmv.v.f v9, ft1 \n\t"
"vfmacc.vv v16, v8, v0 \n\t"
"vfmacc.vv v17, v8, v1 \n\t"
"vfmacc.vv v18, v8, v2 \n\t"
"vfmacc.vv v19, v8, v3 \n\t"
"vfmacc.vv v20, v9, v0 \n\t"
"vfmacc.vv v21, v9, v1 \n\t"
"vfmacc.vv v22, v9, v2 \n\t"
"vfmacc.vv v23, v9, v3 \n\t"
"addi t0, t0, -1 \n\t"
"bgtz t0, M16x2_TAILLOOP \n\t"
//store results: reload C, fuse in alpha*accumulator, write back
"M16x2_SAVERESULT: \n\t"
//broadcast alpha into v8
"vfmv.v.f v8, %[ALPHA] \n\t"
"vle.v v0, (%[C0]) \n\t"
"addi t4, %[C0], 4*4 \n\t"
"vle.v v1, (%[C1]) \n\t"
"addi t5, %[C1], 4*4 \n\t"
//multiply accumulators by alpha and add into C
"vfmacc.vv v0, v8, v16 \n\t"
"vle.v v4, (t4) \n\t"
"vfmacc.vv v1, v8, v20 \n\t"
"vle.v v5, (t5) \n\t"
"vfmacc.vv v4, v8, v17 \n\t"
"vse.v v0, (%[C0]) \n\t"
"add %[C0], %[C0], 8*4 \n\t"
"vfmacc.vv v5, v8, v21 \n\t"
"vse.v v1, (%[C1]) \n\t"
"add %[C1], %[C1], 8*4 \n\t"
"vle.v v0, (%[C0]) \n\t"
"vse.v v4, (t4) \n\t"
"add t4, t4, 8*4 \n\t"
"vle.v v1, (%[C1]) \n\t"
"vse.v v5, (t5) \n\t"
"add t5, t5, 8*4 \n\t"
"vfmacc.vv v0, v8, v18 \n\t"
"vle.v v4, (t4) \n\t"
"vfmacc.vv v1, v8, v22 \n\t"
"vle.v v5, (t5) \n\t"
"vfmacc.vv v4, v8, v19 \n\t"
"vse.v v0, (%[C0]) \n\t"
"add %[C0], %[C0], 8*4 \n\t"
"vfmacc.vv v5, v8, v23 \n\t"
"vse.v v1, (%[C1]) \n\t"
"add %[C1], %[C1], 8*4 \n\t"
"vse.v v4, (t4) \n\t"
"vse.v v5, (t5) \n\t"
"M16x2_END: \n\t"
:[C0]"+r"(C0),[C1]"+r"(C1),
[PA]"+r"(ptrba), [PB]"+r"(ptrbb)
:[ALPHA]"f"(alpha), [BK]"r"(bk)
:"cc", "t0", "t4","t5","t6","t3","t1","t2",
"ft11", "ft0", "ft1", "ft2","ft3","ft4", "ft5", "ft6","ft7",
"v0", "v1", "v2", "v3","v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11","v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
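// no 8x2 vector kernel here: row tails of the 2-column panel stay in plain C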
if(bm&8){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
res8 = 0;
res9 = 0;
res10 = 0;
res11 = 0;
res12 = 0;
res13 = 0;
res14 = 0;
res15 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
load1 = ptrba[1];
load2 = ptrba[2];
load3 = ptrba[3];
load4 = ptrba[4];
load5 = ptrba[5];
load6 = ptrba[6];
load7 = ptrba[7];
res0 = res0 + load0 * loadb0;
res1 = res1 + load1 * loadb0;
res2 = res2 + load2 * loadb0;
res3 = res3 + load3 * loadb0;
res4 = res4 + load4 * loadb0;
res5 = res5 + load5 * loadb0;
res6 = res6 + load6 * loadb0;
res7 = res7 + load7 * loadb0;
res8 = res8 + load0 * loadb1;
res9 = res9 + load1 * loadb1;
res10 = res10 + load2 * loadb1;
res11 = res11 + load3 * loadb1;
res12 = res12 + load4 * loadb1;
res13 = res13 + load5 * loadb1;
res14 = res14 + load6 * loadb1;
res15 = res15 + load7 * loadb1;
ptrba += 8;
ptrbb += 2;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
res4 = res4 * alpha;
res5 = res5 * alpha;
res6 = res6 * alpha;
res7 = res7 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
res10 = res10 * alpha;
res11 = res11 * alpha;
res12 = res12 * alpha;
res13 = res13 * alpha;
res14 = res14 * alpha;
res15 = res15 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C0[4] += res4;
C0[5] += res5;
C0[6] += res6;
C0[7] += res7;
C1[0] += res8;
C1[1] += res9;
C1[2] += res10;
C1[3] += res11;
C1[4] += res12;
C1[5] += res13;
C1[6] += res14;
C1[7] += res15;
C0 += 8;
C1 += 8;
}
if(bm&4){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res8 = 0;
res9 = 0;
res10 = 0;
res11 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
load1 = ptrba[1];
load2 = ptrba[2];
load3 = ptrba[3];
res0 = res0 + load0 * loadb0;
res1 = res1 + load1 * loadb0;
res2 = res2 + load2 * loadb0;
res3 = res3 + load3 * loadb0;
res8 = res8 + load0 * loadb1;
res9 = res9 + load1 * loadb1;
res10 = res10 + load2 * loadb1;
res11 = res11 + load3 * loadb1;
ptrba += 4;
ptrbb += 2;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
res10 = res10 * alpha;
res11 = res11 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C1[0] += res8;
C1[1] += res9;
C1[2] += res10;
C1[3] += res11;
C0 += 4;
C1 += 4;
}
if(bm&2){
ptrbb = bb;
res0 = 0;
res1 = 0;
res8 = 0;
res9 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
load1 = ptrba[1];
res0 = res0 + load0 * loadb0;
res1 = res1 + load1 * loadb0;
res8 = res8 + load0 * loadb1;
res9 = res9 + load1 * loadb1;
ptrba += 2;
ptrbb += 2;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
C0[0] += res0;
C0[1] += res1;
C1[0] += res8;
C1[1] += res9;
C0 += 2;
C1 += 2;
}
if(bm&1){
ptrbb = bb;
res0 = 0;
res8 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
loadb1 = ptrbb[1];
load0 = ptrba[0];
res0 = res0 + load0 * loadb0;
res8 = res8 + load0 * loadb1;
ptrba += 1;
ptrbb += 2;
}
res0 = res0 * alpha;
res8 = res8 * alpha;
C0[0] += res0;
C1[0] += res8;
C0 += 1;
C1 += 1;
}
k = bk<<1;
bb = bb+k;
i = ldc<<1;
C = C+i;
}
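// last single column (bn & 1): all row widths handled in scalar C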
if (bn&1){
C0 = C;
ptrba = ba;
for(i=0; i<bm/16; i+=1){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
res8 = 0;
res9 = 0;
res10 = 0;
res11 = 0;
res12 = 0;
res13 = 0;
res14 = 0;
res15 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
res0 = res0 + ptrba[0] * loadb0;
res1 = res1 + ptrba[1] * loadb0;
res2 = res2 + ptrba[2] * loadb0;
res3 = res3 + ptrba[3] * loadb0;
res4 = res4 + ptrba[4] * loadb0;
res5 = res5 + ptrba[5] * loadb0;
res6 = res6 + ptrba[6] * loadb0;
res7 = res7 + ptrba[7] * loadb0;
res8 = res8 + ptrba[8] * loadb0;
res9 = res9 + ptrba[9] * loadb0;
res10 = res10 + ptrba[10] * loadb0;
res11 = res11 + ptrba[11] * loadb0;
res12 = res12 + ptrba[12] * loadb0;
res13 = res13 + ptrba[13] * loadb0;
res14 = res14 + ptrba[14] * loadb0;
res15 = res15 + ptrba[15] * loadb0;
ptrba += 16;
ptrbb += 1;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
res4 = res4 * alpha;
res5 = res5 * alpha;
res6 = res6 * alpha;
res7 = res7 * alpha;
res8 = res8 * alpha;
res9 = res9 * alpha;
res10 = res10 * alpha;
res11 = res11 * alpha;
res12 = res12 * alpha;
res13 = res13 * alpha;
res14 = res14 * alpha;
res15 = res15 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C0[4] += res4;
C0[5] += res5;
C0[6] += res6;
C0[7] += res7;
C0[8] += res8;
C0[9] += res9;
C0[10] += res10;
C0[11] += res11;
C0[12] += res12;
C0[13] += res13;
C0[14] += res14;
C0[15] += res15;
C0 += 16;
}
if(bm&8){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
res4 = 0;
res5 = 0;
res6 = 0;
res7 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
res0 = res0 + ptrba[0] * loadb0;
res1 = res1 + ptrba[1] * loadb0;
res2 = res2 + ptrba[2] * loadb0;
res3 = res3 + ptrba[3] * loadb0;
res4 = res4 + ptrba[4] * loadb0;
res5 = res5 + ptrba[5] * loadb0;
res6 = res6 + ptrba[6] * loadb0;
res7 = res7 + ptrba[7] * loadb0;
ptrba += 8;
ptrbb += 1;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
res4 = res4 * alpha;
res5 = res5 * alpha;
res6 = res6 * alpha;
res7 = res7 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C0[4] += res4;
C0[5] += res5;
C0[6] += res6;
C0[7] += res7;
C0 += 8;
}
if(bm&4){
ptrbb = bb;
res0 = 0;
res1 = 0;
res2 = 0;
res3 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
res0 = res0 + ptrba[0] * loadb0;
res1 = res1 + ptrba[1] * loadb0;
res2 = res2 + ptrba[2] * loadb0;
res3 = res3 + ptrba[3] * loadb0;
ptrba += 4;
ptrbb += 1;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
res2 = res2 * alpha;
res3 = res3 * alpha;
C0[0] += res0;
C0[1] += res1;
C0[2] += res2;
C0[3] += res3;
C0 += 4;
}
if(bm&2){
ptrbb = bb;
res0 = 0;
res1 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
res0 = res0 + ptrba[0] * loadb0;
res1 = res1 + ptrba[1] * loadb0;
ptrba += 2;
ptrbb += 1;
}
res0 = res0 * alpha;
res1 = res1 * alpha;
C0[0] += res0;
C0[1] += res1;
C0 += 2;
}
if(bm&1){
ptrbb = bb;
res0 = 0;
for(k=0; k<bk; k+=1){
loadb0 = ptrbb[0];
res0 = res0 + ptrba[0] * loadb0;
ptrba += 1;
ptrbb += 1;
}
res0 = res0 * alpha;
C0[0] += res0;
C0 += 1;
}
k = bk;
bb = bb+k;
C = C+ldc;
}
return 0;
}