/*
 * NOTE(review): the second #include below has no header name in the text
 * under review; confirm it against the repository copy before building.
 */
#include "common.h"
#include

/*
 * Assembly text fragments, presumably spliced into inline asm statements in
 * the function that follows (the call sites are not visible in this part of
 * the file -- confirm there).
 *
 * Every KERNEL* macro expands to a run of adjacent string literals, one
 * RISC-V instruction per literal, each ending in "\n\t".  The text refers to
 * two named asm operands, %[PA] and %[PB]; whatever binds them and declares
 * clobbers lives outside these macros.
 *
 * Registers touched by the fragments:
 *   %[PA], t4, t5, t6   addresses used by the vector loads (vle.v); the _I
 *                       fragments place t4/t5/t6 at 4*4, 8*4 and 12*4 bytes
 *                       past %[PA]
 *   %[PB], t1, t2, t3   addresses used by the scalar loads (flw); the _I
 *                       fragments place t1/t2/t3 at 1*4, 2*4 and 3*4 bytes
 *                       past %[PB]
 *   v0-v3,  ft0-ft3 -> v8-v11    operand set 0 (scalars are broadcast into
 *                                vector registers with vfmv.v.f)
 *   v4-v7,  ft4-ft7 -> v12-v15   operand set 1
 *   v16-v31                      accumulators, updated with vfmacc.vv
 *
 * Accumulator grouping: v16-v19 take products with the first broadcast
 * scalar, v20-v23 the second, v24-v27 the third, v28-v31 the fourth; inside
 * a group the registers pair with the 1st..4th vector load.  The 8x4 and
 * 16x2 variants use subsets of the same layout.
 *
 * Suffixes:
 *   _I   initialises the helper address registers, loads and consumes
 *        set 0, and loads set 1
 *   _M1  consumes set 0 while loading set 1
 *   _M2  consumes set 1 while loading set 0
 *   _E   consumes set 1 and loads nothing
 * Because _I and _M1 leave set 1 pending and _M2 leaves set 0 pending, the
 * fragments are presumably chained as _I, then _M2/_M1 pairs, then _E --
 * verify against the loop code.
 *
 * No fragment clears the accumulators or configures the vector unit.  The
 * 4*4-byte spacing between consecutive vector loads suggests each vle.v is
 * expected to cover four 32-bit elements -- TODO confirm where the vector
 * length and element width are set.
 */

/*
 * 16x4, first step.  Reads four scalars starting at %[PB] and four vectors
 * starting at %[PA], accumulates all 16 products into v16-v31, and preloads
 * the next four scalars/vectors into set 1.  Net effect on the pointers:
 * %[PA], t4, t5, t6 move forward 2*16*4 bytes; %[PB], t1, t2, t3 move
 * forward 2*4*4 bytes.
 */
#define KERNEL16x4_I \
    "addi t1, %[PB], 1*4 \n\t"\
    "addi t2, %[PB], 2*4 \n\t"\
    "addi t3, %[PB], 3*4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "flw ft1, (t1) \n\t"\
    "flw ft2, (t2) \n\t"\
    "flw ft3, (t3) \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi t4, %[PA], 4*4 \n\t"\
    "addi t5, %[PA], 8*4 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "addi t6, %[PA], 12*4 \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"\
    "vle.v v2, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vle.v v3, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmv.v.f v10, ft2 \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmv.v.f v11, ft3 \n\t"\
    "vfmacc.vv v16, v8, v0 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vle.v v6, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v18, v8, v2 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vle.v v7, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmacc.vv v19, v8, v3 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft5, (t1) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "flw ft6, (t2) \n\t"\
    "vfmacc.vv v22, v9, v2 \n\t"\
    "flw ft7, (t3) \n\t"\
    "vfmacc.vv v23, v9, v3 \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "vfmacc.vv v24, v10, v0 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"\
    "vfmacc.vv v25, v10, v1 \n\t"\
    "vfmv.v.f v14, ft6 \n\t"\
    "vfmacc.vv v26, v10, v2 \n\t"\
    "vfmv.v.f v15, ft7 \n\t"\
    "vfmacc.vv v27, v10, v3 \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vfmacc.vv v28, v11, v0 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v29, v11, v1 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmacc.vv v30, v11, v2 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v31, v11, v3 \n\t"

/*
 * 16x4, middle step A.  Accumulates set 0 (v8-v11 with v0-v3) while loading
 * set 1 (v4-v7 and ft4-ft7 -> v12-v15).  Advances %[PA], t4-t6 by 16*4 bytes
 * and %[PB], t1-t3 by 4*4 bytes.
 */
#define KERNEL16x4_M1 \
    "vfmacc.vv v16, v8, v0 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v18, v8, v2 \n\t"\
    "vle.v v6, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v19, v8, v3 \n\t"\
    "vle.v v7, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "flw ft5, (t1) \n\t"\
    "vfmacc.vv v22, v9, v2 \n\t"\
    "flw ft6, (t2) \n\t"\
    "vfmacc.vv v23, v9, v3 \n\t"\
    "flw ft7, (t3) \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vfmacc.vv v24, v10, v0 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v25, v10, v1 \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "vfmacc.vv v26, v10, v2 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmacc.vv v27, v10, v3 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"\
    "vfmacc.vv v28, v11, v0 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v29, v11, v1 \n\t"\
    "vfmv.v.f v14, ft6 \n\t"\
    "vfmacc.vv v30, v11, v2 \n\t"\
    "vfmacc.vv v31, v11, v3 \n\t"\
    "vfmv.v.f v15, ft7 \n\t"

/*
 * 16x4, middle step B.  Mirror image of KERNEL16x4_M1: accumulates set 1
 * (v12-v15 with v4-v7) while loading set 0 (v0-v3 and ft0-ft3 -> v8-v11).
 * Same pointer advances as _M1.
 */
#define KERNEL16x4_M2 \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v18, v12, v6 \n\t"\
    "vle.v v2, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v19, v12, v7 \n\t"\
    "vle.v v3, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "flw ft1, (t1) \n\t"\
    "vfmacc.vv v22, v13, v6 \n\t"\
    "flw ft2, (t2) \n\t"\
    "vfmacc.vv v23, v13, v7 \n\t"\
    "flw ft3, (t3) \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vfmacc.vv v24, v14, v4 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v25, v14, v5 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "vfmacc.vv v26, v14, v6 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmacc.vv v27, v14, v7 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"\
    "vfmacc.vv v28, v15, v4 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v29, v15, v5 \n\t"\
    "vfmv.v.f v10, ft2 \n\t"\
    "vfmacc.vv v30, v15, v6 \n\t"\
    "vfmacc.vv v31, v15, v7 \n\t"\
    "vfmv.v.f v11, ft3 \n\t"

/*
 * 16x4, final step.  Accumulates the pending set 1 (v12-v15 with v4-v7);
 * performs no loads and leaves every address register untouched.
 */
#define KERNEL16x4_E \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vfmacc.vv v18, v12, v6 \n\t"\
    "vfmacc.vv v19, v12, v7 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "vfmacc.vv v22, v13, v6 \n\t"\
    "vfmacc.vv v23, v13, v7 \n\t"\
    "vfmacc.vv v24, v14, v4 \n\t"\
    "vfmacc.vv v25, v14, v5 \n\t"\
    "vfmacc.vv v26, v14, v6 \n\t"\
    "vfmacc.vv v27, v14, v7 \n\t"\
    "vfmacc.vv v28, v15, v4 \n\t"\
    "vfmacc.vv v29, v15, v5 \n\t"\
    "vfmacc.vv v30, v15, v6 \n\t"\
    "vfmacc.vv v31, v15, v7 \n\t"

/*
 * 8x4, first step.  Same scheme as KERNEL16x4_I but with two vector loads
 * per step (v0-v1 / v4-v5, via %[PA] and t4 only) and the accumulators
 * v16-v17, v20-v21, v24-v25, v28-v29.  Net pointer movement: %[PA] and t4
 * forward 2*8*4 bytes; %[PB], t1, t2, t3 forward 2*4*4 bytes.
 */
#define KERNEL8x4_I \
    "addi t1, %[PB], 1*4 \n\t"\
    "addi t2, %[PB], 2*4 \n\t"\
    "addi t3, %[PB], 3*4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "flw ft1, (t1) \n\t"\
    "flw ft2, (t2) \n\t"\
    "flw ft3, (t3) \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi t4, %[PA], 4*4 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "addi %[PA], %[PA], 8*4 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 8*4 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"\
    "vfmv.v.f v10, ft2 \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 8*4 \n\t"\
    "vfmv.v.f v11, ft3 \n\t"\
    "vfmacc.vv v16, v8, v0 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 8*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft5, (t1) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "flw ft6, (t2) \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "flw ft7, (t3) \n\t"\
    "vfmacc.vv v24, v10, v0 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"\
    "vfmacc.vv v25, v10, v1 \n\t"\
    "vfmv.v.f v14, ft6 \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "vfmv.v.f v15, ft7 \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v28, v11, v0 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmacc.vv v29, v11, v1 \n\t"\
    "addi t3, t3, 4*4 \n\t"

/*
 * 8x4, middle step A.  Accumulates set 0 (v8-v11 with v0-v1) while loading
 * set 1 (v4-v5 and ft4-ft7 -> v12-v15).  Advances %[PA], t4 by 8*4 bytes and
 * %[PB], t1-t3 by 4*4 bytes.
 */
#define KERNEL8x4_M1 \
    "vfmacc.vv v16, v8, v0 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 8*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 8*4 \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "flw ft5, (t1) \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "flw ft6, (t2) \n\t"\
    "vfmacc.vv v24, v10, v0 \n\t"\
    "flw ft7, (t3) \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v25, v10, v1 \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"\
    "vfmacc.vv v28, v11, v0 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v29, v11, v1 \n\t"\
    "vfmv.v.f v14, ft6 \n\t"\
    "vfmv.v.f v15, ft7 \n\t"

/*
 * 8x4, middle step B.  Mirror image of KERNEL8x4_M1: accumulates set 1
 * (v12-v15 with v4-v5) while loading set 0 (v0-v1 and ft0-ft3 -> v8-v11).
 */
#define KERNEL8x4_M2 \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 8*4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 8*4 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "flw ft1, (t1) \n\t"\
    "addi %[PB], %[PB], 4*4 \n\t"\
    "flw ft2, (t2) \n\t"\
    "vfmacc.vv v24, v14, v4 \n\t"\
    "flw ft3, (t3) \n\t"\
    "addi t1, t1, 4*4 \n\t"\
    "vfmacc.vv v25, v14, v5 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "addi t2, t2, 4*4 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"\
    "vfmacc.vv v28, v15, v4 \n\t"\
    "addi t3, t3, 4*4 \n\t"\
    "vfmacc.vv v29, v15, v5 \n\t"\
    "vfmv.v.f v10, ft2 \n\t"\
    "vfmv.v.f v11, ft3 \n\t"

/*
 * 8x4, final step.  Accumulates the pending set 1 (v12-v15 with v4-v5);
 * no loads, no pointer updates.
 */
#define KERNEL8x4_E \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "vfmacc.vv v24, v14, v4 \n\t"\
    "vfmacc.vv v25, v14, v5 \n\t"\
    "vfmacc.vv v28, v15, v4 \n\t"\
    "vfmacc.vv v29, v15, v5 \n\t"

/*
 * 16x2, first step.  Same scheme as KERNEL16x4_I but with two scalars per
 * step (ft0-ft1 / ft4-ft5, via %[PB] and t1 only) and the accumulators
 * v16-v23.  Net pointer movement: %[PA], t4, t5, t6 forward 2*16*4 bytes;
 * %[PB] and t1 forward 2*2*4 bytes.
 */
#define KERNEL16x2_I \
    "addi t1, %[PB], 1*4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "flw ft1, (t1) \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi t4, %[PA], 4*4 \n\t"\
    "addi t5, %[PA], 8*4 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "addi t6, %[PA], 12*4 \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"\
    "vle.v v2, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vle.v v3, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "addi %[PB], %[PB], 2*4 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmacc.vv v16, v8, v0 \n\t"\
    "addi t1, t1, 2*4 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "vle.v v6, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v18, v8, v2 \n\t"\
    "vle.v v7, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmacc.vv v19, v8, v3 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft5, (t1) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "addi %[PB], %[PB], 2*4 \n\t"\
    "vfmacc.vv v22, v9, v2 \n\t"\
    "addi t1, t1, 2*4 \n\t"\
    "vfmacc.vv v23, v9, v3 \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"

/*
 * 16x2, middle step A.  Accumulates set 0 (v8-v9 with v0-v3) while loading
 * set 1 (v4-v7 and ft4-ft5 -> v12-v13).  Advances %[PA], t4-t6 by 16*4 bytes
 * and %[PB], t1 by 2*4 bytes.
 */
#define KERNEL16x2_M1 \
    "vfmacc.vv v16, v8, v0 \n\t"\
    "vle.v v4, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmacc.vv v17, v8, v1 \n\t"\
    "vle.v v5, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v18, v8, v2 \n\t"\
    "vle.v v6, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v19, v8, v3 \n\t"\
    "vle.v v7, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "flw ft4, (%[PB]) \n\t"\
    "vfmacc.vv v20, v9, v0 \n\t"\
    "flw ft5, (t1) \n\t"\
    "vfmacc.vv v21, v9, v1 \n\t"\
    "vfmv.v.f v12, ft4 \n\t"\
    "vfmacc.vv v22, v9, v2 \n\t"\
    "addi t1, t1, 2*4 \n\t"\
    "vfmacc.vv v23, v9, v3 \n\t"\
    "addi %[PB], %[PB], 2*4 \n\t"\
    "vfmv.v.f v13, ft5 \n\t"

/*
 * 16x2, middle step B.  Mirror image of KERNEL16x2_M1: accumulates set 1
 * (v12-v13 with v4-v7) while loading set 0 (v0-v3 and ft0-ft1 -> v8-v9).
 */
#define KERNEL16x2_M2 \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vle.v v0, (%[PA]) \n\t"\
    "addi %[PA], %[PA], 16*4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vle.v v1, (t4) \n\t"\
    "addi t4, t4, 16*4 \n\t"\
    "vfmacc.vv v18, v12, v6 \n\t"\
    "vle.v v2, (t5) \n\t"\
    "addi t5, t5, 16*4 \n\t"\
    "vfmacc.vv v19, v12, v7 \n\t"\
    "vle.v v3, (t6) \n\t"\
    "addi t6, t6, 16*4 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "flw ft0, (%[PB]) \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "flw ft1, (t1) \n\t"\
    "vfmacc.vv v22, v13, v6 \n\t"\
    "vfmv.v.f v8, ft0 \n\t"\
    "vfmacc.vv v23, v13, v7 \n\t"\
    "addi %[PB], %[PB], 2*4 \n\t"\
    "addi t1, t1, 2*4 \n\t"\
    "vfmv.v.f v9, ft1 \n\t"

/*
 * 16x2, final step.  Accumulates the pending set 1 (v12-v13 with v4-v7);
 * no loads, no pointer updates.
 */
#define KERNEL16x2_E \
    "vfmacc.vv v16, v12, v4 \n\t"\
    "vfmacc.vv v17, v12, v5 \n\t"\
    "vfmacc.vv v18, v12, v6 \n\t"\
    "vfmacc.vv v19, v12, v7 \n\t"\
    "vfmacc.vv v20, v13, v4 \n\t"\
    "vfmacc.vv v21, v13, v5 \n\t"\
    "vfmacc.vv v22, v13, v6 \n\t"\
    "vfmacc.vv v23, v13, v7 \n\t"

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
		,BLASLONG offset
#endif
		)
{
   BLASLONG i,j,k;
   FLOAT *C0,*C1,*C2,*C3;
   FLOAT *ptrba,*ptrbb, *tmpc;
   FLOAT loadb0,loadb1,loadb2,loadb3;
   FLOAT load0,load1,load2,load3,load4,load5,load6,load7;
FLOAT res0,res1,res2,res3; FLOAT res4,res5,res6,res7; FLOAT res8,res9,res10,res11; FLOAT res12,res13,res14,res15; for (j=0; j