Initial strmm (sgemm) kernel; not tuned yet.

This commit is contained in:
Abdurrauf 2017-03-06 04:27:40 +04:00
parent 411982715c
commit 82e80fa82b
6 changed files with 3776 additions and 11 deletions

View File

@ -80,16 +80,20 @@ DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ztrmm4x4V.S
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

View File

@ -0,0 +1,903 @@
/**********************************Zero Vectors**************************************************/
.macro ZERO_CVEC_8x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
vzero %v24
vzero %v25
vzero %v26
vzero %v27
vzero %v28
vzero %v29
vzero %v30
vzero %v31
.endm
.macro ZERO_CVEC_8x2
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
.endm
.macro ZERO_CVEC_8x1
vzero %v16
vzero %v17
vzero %v18
vzero %v19
.endm
.macro ZERO_CVEC_4x4
vzero %v16
vzero %v17
vzero %v20
vzero %v21
vzero %v24
vzero %v25
vzero %v28
vzero %v29
.endm
.macro ZERO_CVEC_4x2
vzero %v16
vzero %v17
vzero %v20
vzero %v21
.endm
.macro ZERO_CVEC_4x1
vzero %v16
vzero %v17
.endm
.macro ZERO_CVEC_2x4
vzero %v16
vzero %v17
vzero %v20
vzero %v21
.endm
.macro ZERO_CVEC_2x2
vzero %v16
vzero %v20
.endm
.macro ZERO_CVEC_2x1
vzero %v16
.endm
.macro ZERO_CVEC_1x4
vzero %v16
vzero %v17
.endm
.macro ZERO_CVEC_1x2
vzero %v16
.endm
.macro ZERO_CVEC_1x1
LZDR %f1
.endm
/***********************************Helper Calculations*************************************/
#define unit_size 8
#define DISP(ind,stride,disp) (ind*stride+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define N8 (8*unit_size)
#define N4 (4*unit_size)
#define N2 (2*unit_size)
#define N1 (1*unit_size)
.macro Calculate_8x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP4(\Index ,0)(\PTR_B_REG)
vlrepg %v1, DISP4(\Index ,8)(\PTR_B_REG)
vl %v2, DISP8(\Index , 0)(\PTR_A_REG)
vl %v3, DISP8(\Index ,16)(\PTR_A_REG)
vl %v4, DISP8(\Index ,32)(\PTR_A_REG)
vl %v5, DISP8(\Index ,48)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
vfmadb %v17,%v3,%v7,%v17
vfmadb %v18,%v4,%v7,%v18
vfmadb %v19,%v5,%v7,%v19
vfmadb %v20,%v2,%v1,%v20
vfmadb %v21,%v3,%v1,%v21
vfmadb %v22,%v4,%v1,%v22
vfmadb %v23,%v5,%v1,%v23
vlrepg %v7, DISP4(\Index ,16)(\PTR_B_REG)
vlrepg %v1, DISP4(\Index ,24)(\PTR_B_REG)
.if \IsLast==1
la \PTR_A_REG, DISP8(\Index ,64)(\PTR_A_REG)
.endif
vfmadb %v24,%v2,%v7,%v24
vfmadb %v25,%v3,%v7,%v25
vfmadb %v26,%v4,%v7,%v26
vfmadb %v27,%v5,%v7,%v27
vfmadb %v28,%v2,%v1,%v28
vfmadb %v29,%v3,%v1,%v29
vfmadb %v30,%v4,%v1,%v30
vfmadb %v31,%v5,%v1,%v31
.if \IsLast==1
la \PTR_B_REG, DISP4(\Index ,32)(\PTR_B_REG)
.endif
.endm
.macro Calculate_8x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP2(\Index ,0)(\PTR_B_REG)
vlrepg %v1, DISP2(\Index ,8)(\PTR_B_REG)
vl %v2, DISP8(\Index ,0)(\PTR_A_REG)
vl %v3, DISP8(\Index ,16)(\PTR_A_REG)
vl %v4, DISP8(\Index ,32)(\PTR_A_REG)
vl %v5, DISP8(\Index ,48)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
vfmadb %v17,%v3,%v7,%v17
vfmadb %v18,%v4,%v7,%v18
vfmadb %v19,%v5,%v7,%v19
vfmadb %v20,%v2,%v1,%v20
vfmadb %v21,%v3,%v1,%v21
.if \IsLast==1
la \PTR_A_REG, DISP8(\Index ,64)(\PTR_A_REG)
.endif
vfmadb %v22,%v4,%v1,%v22
vfmadb %v23,%v5,%v1,%v23
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG)
.endif
.endm
.macro Calculate_8x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP1(\Index ,0)(\PTR_B_REG)
vl %v2, DISP8(\Index ,0)(\PTR_A_REG)
vl %v3, DISP8(\Index ,16)(\PTR_A_REG)
vl %v4, DISP8(\Index ,32)(\PTR_A_REG)
vl %v5, DISP8(\Index ,48)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG)
.endif
vfmadb %v17,%v3,%v7,%v17
vfmadb %v18,%v4,%v7,%v18
vfmadb %v19,%v5,%v7,%v19
.if \IsLast==1
la \PTR_A_REG, DISP8(\Index ,64)(\PTR_A_REG)
.endif
.endm
.macro Calculate_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP4(\Index ,0)(\PTR_B_REG)
vlrepg %v1, DISP4(\Index ,8)(\PTR_B_REG)
vl %v2, DISP4(\Index ,0)(\PTR_A_REG)
vl %v3, DISP4(\Index ,16)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
vfmadb %v17,%v3,%v7,%v17
vfmadb %v20,%v2,%v1,%v20
vfmadb %v21,%v3,%v1,%v21
vlrepg %v7, DISP4(\Index ,16)(\PTR_B_REG)
vlrepg %v1, DISP4(\Index ,24)(\PTR_B_REG)
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG)
.endif
vfmadb %v24,%v2,%v7,%v24
vfmadb %v25,%v3,%v7,%v25
vfmadb %v28,%v2,%v1,%v28
vfmadb %v29,%v3,%v1,%v29
.if \IsLast==1
la \PTR_B_REG, DISP4(\Index ,32)(\PTR_B_REG)
.endif
.endm
.macro Calculate_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP2(\Index ,0)(\PTR_B_REG)
vlrepg %v1, DISP2(\Index ,8)(\PTR_B_REG)
vl %v2, DISP4(\Index ,0)(\PTR_A_REG)
vl %v3, DISP4(\Index ,16)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
vfmadb %v17,%v3,%v7,%v17
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG)
.endif
vfmadb %v20,%v2,%v1,%v20
vfmadb %v21,%v3,%v1,%v21
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG)
.endif
.endm
.macro Calculate_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP1(\Index ,0)(\PTR_B_REG)
vl %v2, DISP4(\Index ,0)(\PTR_A_REG)
vl %v3, DISP4(\Index ,16)(\PTR_A_REG)
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG)
.endif
vfmadb %v16,%v2,%v7,%v16
vfmadb %v17,%v3,%v7,%v17
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG)
.endif
.endm
.macro Calculate_2x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP2(\Index ,0)(\PTR_B_REG)
vlrepg %v1, DISP2(\Index ,8)(\PTR_B_REG)
vl %v2, DISP2(\Index ,0)(\PTR_A_REG)
vfmadb %v16,%v2,%v7,%v16
.if \IsLast==1
la \PTR_A_REG, DISP2(\Index ,16)(\PTR_A_REG)
.endif
vfmadb %v20,%v2,%v1,%v20
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG)
.endif
.endm
.macro Calculate_2x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlrepg %v7, DISP1(\Index ,0)(\PTR_B_REG)
vl %v2, DISP2(\Index ,0)(\PTR_A_REG)
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG)
.endif
vfmadb %v16,%v2,%v7,%v16
.if \IsLast==1
la \PTR_A_REG, DISP2(\Index ,16)(\PTR_A_REG)
.endif
.endm
.macro Calculate_1x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
ld %f2,DISP1(\Index ,0)(\PTR_A_REG) /**a*/
.if \IsLast==1
la \PTR_A_REG,DISP1(\Index ,8)(\PTR_A_REG)
.endif
madb %f1,%f2,DISP1(\Index ,0)(\PTR_B_REG)
.if \IsLast==1
la \PTR_B_REG,DISP1(\Index ,8)(\PTR_B_REG)
.endif
.endm
.macro CALC_8x4 PTR_A_REG,PTR_B_REG
Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_8x4_4 PTR_A_REG,PTR_B_REG
Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_8x2 PTR_A_REG,PTR_B_REG
Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_8x2_4 PTR_A_REG,PTR_B_REG
Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_8x1 PTR_A_REG,PTR_B_REG
Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_8x1_4 PTR_A_REG,PTR_B_REG
Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_4x4 PTR_A_REG,PTR_B_REG
Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_4x4_4 PTR_A_REG,PTR_B_REG
Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_4x2 PTR_A_REG,PTR_B_REG
Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_4x2_4 PTR_A_REG,PTR_B_REG
Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_4x1 PTR_A_REG,PTR_B_REG
Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_4x1_4 PTR_A_REG,PTR_B_REG
Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_2x4 PTR_A_REG,PTR_B_REG
Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,0,1
.endm
.macro CALC_2x4_4 PTR_A_REG,PTR_B_REG
Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,0,0
Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,1,0
Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,2,0
Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,3,1
.endm
.macro CALC_2x2 PTR_A_REG,PTR_B_REG
Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_2x2_4 PTR_A_REG,PTR_B_REG
Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_2x1 PTR_A_REG,PTR_B_REG
Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_2x1_4 PTR_A_REG,PTR_B_REG
Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro CALC_1x4 PTR_A_REG,PTR_B_REG
Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,0,1
.endm
.macro CALC_1x4_4 PTR_A_REG,PTR_B_REG
Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,0,0
Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,1,0
Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,2,0
Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,3,1
.endm
.macro CALC_1x2 PTR_A_REG,PTR_B_REG
Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,0,1
.endm
.macro CALC_1x2_4 PTR_A_REG,PTR_B_REG
Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,0,0
Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,1,0
Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,2,0
Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,3,1
.endm
.macro CALC_1x1 PTR_A_REG,PTR_B_REG
Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro CALC_1x1_4 PTR_A_REG,PTR_B_REG
Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,0,0
Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,1,0
Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,2,0
Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
/**************************************STORAGE*************************************************/
.macro Multiply_8x1 vr1,vr2,vr3,vr4,va1,va2,va3,va4,vb1
#if defined(TRMMKERNEL)
vfmdb \vr1,\va1,\vb1
vfmdb \vr2,\va2,\vb1
vfmdb \vr3,\va3,\vb1
vfmdb \vr4,\va4,\vb1
#else
vfmadb \vr1,\va1,\vb1,\vr1
vfmadb \vr2,\va2,\vb1,\vr2
vfmadb \vr3,\va3,\vb1,\vr3
vfmadb \vr4,\va4,\vb1,\vr4
#endif
.endm
.macro Multiply_4x1 vr1,vr2, va1,va2, vb1
#if defined(TRMMKERNEL)
vfmdb \vr1,\va1,\vb1
vfmdb \vr2,\va2,\vb1
#else
vfmadb \vr1,\va1,\vb1,\vr1
vfmadb \vr2,\va2,\vb1,\vr2
#endif
.endm
.macro Multiply_2x1 vr1, va1,vb1
#if defined(TRMMKERNEL)
vfmdb \vr1,\va1,\vb1
#else
vfmadb \vr1,\va1,\vb1,\vr1
#endif
.endm
.macro STORE_8x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2
la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
vl %v3,32(\CIJ_REG)
vl %v4,48(\CIJ_REG)
#endif
Multiply_8x1 %v1,%v2,%v3,%v4, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
vst %v3,32(\CIJ_REG)
vst %v4,48(\CIJ_REG)
la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL )
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
Multiply_8x1 %v16,%v17,%v18,%v19, %v20,%v21,%v22,%v23 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG,\LV1)
vl %v2,16(\CIJ_REG,\LV1)
vl %v3,32(\CIJ_REG,\LV1)
vl %v4,48(\CIJ_REG,\LV1)
#endif
Multiply_8x1 %v1,%v2,%v3,%v4, %v24,%v25,%v26,%v27 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG,\LV1)
vst %v2,16(\CIJ_REG,\LV1)
vst %v3,32(\CIJ_REG,\LV1)
vst %v4,48(\CIJ_REG,\LV1)
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LV2)
vl %v17,16(\CIJ_REG,\LV2)
vl %v18,32(\CIJ_REG,\LV2)
vl %v19,48(\CIJ_REG,\LV2)
#endif
Multiply_8x1 %v16,%v17,%v18,%v19, %v28,%v29,%v30,%v31 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LV2)
vst %v17,16(\CIJ_REG,\LV2)
vst %v18,32(\CIJ_REG,\LV2)
vst %v19,48(\CIJ_REG,\LV2)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro STORE_8x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
vl %v3,32(\CIJ_REG)
vl %v4,48(\CIJ_REG)
#endif
Multiply_8x1 %v1,%v2,%v3,%v4, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
vst %v3,32(\CIJ_REG)
vst %v4,48(\CIJ_REG)
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
Multiply_8x1 %v16,%v17,%v18,%v19, %v20,%v21,%v22,%v23 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro STORE_8x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
vl %v3,32(\CIJ_REG)
vl %v4,48(\CIJ_REG)
#endif
Multiply_8x1 %v1,%v2,%v3,%v4, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
vst %v3,32(\CIJ_REG)
vst %v4,48(\CIJ_REG)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro STORE_4x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL, LV1 ,LV2
la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
#endif
Multiply_4x1 %v1,%v2 , %v16,%v17 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL )
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
Multiply_4x1 %v16,%v17 , %v20,%v21 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG,\LV1)
vl %v2,16(\CIJ_REG,\LV1)
#endif
Multiply_4x1 %v1,%v2 , %v24,%v25 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG,\LV1)
vst %v2,16(\CIJ_REG,\LV1)
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LV2)
vl %v17,16(\CIJ_REG,\LV2)
#endif
Multiply_4x1 %v16,%v17, %v28,%v29 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LV2)
vst %v17,16(\CIJ_REG,\LV2)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro STORE_4x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
#endif
Multiply_4x1 %v1,%v2 , %v16,%v17 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
#if !defined(TRMMKERNEL)
vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
Multiply_4x1 %v16,%v17 , %v20,%v21 ,\ALPHA_VECREG
vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro STORE_4x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
vl %v2,16(\CIJ_REG)
#endif
Multiply_4x1 %v1,%v2 , %v16,%v17 ,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
vst %v2,16(\CIJ_REG)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro STORE_2x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
#endif
Multiply_2x1 %v1,%v16,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
#if !defined(TRMMKERNEL)
vl %v2,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
Multiply_2x1 %v2,%v20,\ALPHA_VECREG
vst %v2,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro STORE_2x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1,0(\CIJ_REG)
#endif
Multiply_2x1 %v1,%v16,\ALPHA_VECREG
vst %v1,0(\CIJ_REG)
la \CIJ_REG,16(\CIJ_REG)
.endm
/*STORE C1X1*/
/* STORE_1x1: write the single accumulated element (kept in %f1 by the
   1x1 kernel) to C, scaled by alpha, then advance C by one double.
   TRMM:      C[0]  = acc*alpha   (mdbr: f1 *= alpha)
   otherwise: C[0] += acc*alpha   (madbr: f2 = C[0] + f1*alpha)
   Fix: the original body addressed memory through the hard-coded symbol
   CIJ_LOCAL (a #define in the including kernel) instead of the \CIJ_REG
   macro argument, so the parameter was silently ignored. Behavior is
   unchanged at the existing call sites, which pass CIJ_LOCAL, but the
   macro now honors its parameter. */
.macro STORE_1x1 ALPHA_FLOAT,CIJ_REG,LDC_BYTE_ORIGINAL
#if defined(TRMMKERNEL)
mdbr %f1,\ALPHA_FLOAT /* f1 = acc * alpha */
std %f1,0(\CIJ_REG)
#else
ld %f2,0(\CIJ_REG) /* f2 = C[0] */
madbr %f2,%f1,\ALPHA_FLOAT /* f2 += acc * alpha */
std %f2,0(\CIJ_REG)
#endif
la \CIJ_REG,8(\CIJ_REG) /* C += 1 double */
.endm
/*reversed ones*/
.macro STORE_2x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2
/**/
vfmdb %v1,%v16,\ALPHA_REG
vfmdb %v2,%v17,\ALPHA_REG
vfmdb %v6,%v20,\ALPHA_REG
vfmdb %v7,%v21,\ALPHA_REG
vrepg %v4,%v1,1
vrepg %v5,%v6,1
la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
adb %f1, 0(\CIJ_REG)
#endif
std %f1,0(\CIJ_REG)
#if !defined(TRMMKERNEL)
adb %f6, 8(\CIJ_REG)
#endif
std %f6,8(\CIJ_REG)
#if !defined(TRMMKERNEL)
adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
adb %f5,8(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
std %f5,8(\CIJ_REG,\LDC_BYTE_ORIGINAL)
/*add LDC_BYTE */
la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL )
vrepg %v4,%v2,1
vrepg %v5,%v7,1
#if !defined(TRMMKERNEL)
adb %f2,0(\CIJ_REG,\LV1)
#endif
std %f2,0(\CIJ_REG,\LV1)
#if !defined(TRMMKERNEL)
adb %f7,8(\CIJ_REG,\LV1)
#endif
std %f7,8(\CIJ_REG,\LV1)
#if !defined(TRMMKERNEL)
adb %f4,0(\CIJ_REG,\LV2)
#endif
std %f4,0(\CIJ_REG,\LV2)
#if !defined(TRMMKERNEL)
adb %f5,8(\CIJ_REG,\LV2)
#endif
std %f5,8(\CIJ_REG,\LV2)
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro STORE_1x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2
vfmdb %v1,%v16,\ALPHA_REG
vfmdb %v2,%v17,\ALPHA_REG
vrepg %v4,%v1,1
vrepg %v5,%v2,1
la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
adb %f1, 0(\CIJ_REG)
#endif
std %f1,0(\CIJ_REG)
#if !defined(TRMMKERNEL)
adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
/*add LDC_BYTE */
la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL )
#if !defined(TRMMKERNEL)
adb %f2,0(\CIJ_REG,\LV1)
#endif
std %f2,0(\CIJ_REG,\LV1)
#if !defined(TRMMKERNEL)
adb %f5,0(\CIJ_REG,\LV2)
#endif
std %f5,0(\CIJ_REG,\LV2)
la \CIJ_REG,8(\CIJ_REG)
.endm
.macro STORE_1x2 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL
/**/
vfmdb %v1,%v16,\ALPHA_REG
vrepg %v4,%v1,1
#if !defined(TRMMKERNEL)
adb %f1, 0(\CIJ_REG)
#endif
std %f1,0(\CIJ_REG)
#if !defined(TRMMKERNEL)
adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#endif
std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,8(\CIJ_REG)
.endm
/****************************TRMM POINTER REFRESH MACROSES*************************/
.macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
lgr \PTR_B,\B_VAL /*refresh BPOINT*/
#else
/* ptrba =ptrba+ off*C_A;
ptrbb = bb + off*C_B;*/
.if \C_B==4
.if \C_A==8
sllg \PTR_B, \OFF_VAL,5
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*4*/
agr \PTR_A,\PTR_B /*ptrba+off*4**/
la \PTR_B,0(\B_VAL,\PTR_B)
.elseif \C_A==4
sllg \PTR_B, \OFF_VAL,5
agr \PTR_A,\PTR_B /*ptrba+off*4**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/
agr \PTR_B, \PTR_B
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
agr \PTR_A,\PTR_B /*ptrba+off*4**/
sllg \PTR_B, \OFF_VAL,5
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==2
.if \C_A==8
sllg \PTR_B, \OFF_VAL,6
agr \PTR_A,\PTR_B /*ptrba+off*8**/
sllg \PTR_B, \OFF_VAL,4
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==4
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/
agr \PTR_A,\PTR_B /*ptrba+off*2**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,4
agr \PTR_A,\PTR_B /*ptrba+off*2**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/
agr \PTR_B,\PTR_B /* off+off**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==1
.if \C_A==8
sllg \PTR_B, \OFF_VAL,6
agr \PTR_A,\PTR_B /*ptrba+off*8**/
sllg \PTR_B, \OFF_VAL,3
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==4
sllg \PTR_B, \OFF_VAL,5
agr \PTR_A,\PTR_B /*ptrba+off*4**/
sllg \PTR_B, \OFF_VAL,3
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,3
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/
agr \PTR_A,\PTR_B /*ptrba+off*1**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
agr \PTR_A,\PTR_B /*ptrba+off*1**/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.endif
#endif
.endm
/**/
.macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
la \TEMP_VAL,\INCR_A(\OFF_VAL)
#else
/* temp = off+INCR_B // number of values in B*/
la \TEMP_VAL,\INCR_B(\OFF_VAL)
#endif
.endm
.macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#ifdef LEFT
/*temp -= 8; // number of values in A*/
lay \TEMP_VAL,-\C_A(\TEMP_VAL)
#else
/*temp -= 4; // number of values in B*/
lay \TEMP_VAL,-\C_B(\TEMP_VAL)
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
.if \C_A==8
sllg \TEMP_VAL, \TEMP_VAL,6
.elseif \C_A==4
sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*4*/
.elseif \C_A==2
sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*2*/
.elseif \C_A==1
sllg \TEMP_VAL, \TEMP_VAL,3 /*temp*1*/
.endif
la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/
/*we do not need to refresh ptrbb. so lets ignore it*/
#endif
#ifdef LEFT
/*off += 8; // number of values in A*/
aghi \OFF_VAL,\C_A
#endif
.endm

857
kernel/zarch/dtrmm8x4V.S Normal file
View File

@ -0,0 +1,857 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2017/03/05 AbdelRauf (quickwritereader@gmail.com)
* BLASTEST : passed
* CTEST : passed
* TEST : passed
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/*
#BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,stack[160] ,ldc=stack[168]
offset=stack[176]
**********************************************************************************************/
/*Note: r0 can not be used as address disp register */
#define BM %r2
#define BM_CUR %r0
#define BN %r3
#define BN_CUR %r10
#define BK %r4
#define LDC_BYTE %r8
#define ALPHA %f0
#define ALPHA_VECT %v0
#define LOCAL_VAR1 %r9
#define LOCAL_VAR2 %r1
#define LOCAL_VAR3 %r11
#define A %r5
#define B %r6
#define CIJ %r7
#define CIJ_LOCAL %r12
#define OFF %r13
#define OFFSET %f8
#define ALIGN_4 .align 16
#define ALIGN_2 .align 8
#define PREFETCH_INS 1
/**************************Include kernel helper macrosses**********************************/
#include "dkernelMacros.S"
/***********************************DGEMM***********************************************************/
PROLOGUE
#if defined(TRMMKERNEL)
std OFFSET,40(%r15)
stmg %r6,%r13,48(%r15)
#else
stmg %r6,%r12,48(%r15)
#endif
lg CIJ, 160(%r15)
lg LOCAL_VAR1, 168(%r15)
#if defined(TRMMKERNEL)
lg OFF,176(%r15)
ldgr OFFSET ,OFF
#endif
srlg BN_CUR,BN,2
vrepg ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/
sllg LDC_BYTE, LOCAL_VAR1,3 /*calculate lcd stride with bytes double=8 x<<3 */
#if defined(TRMMKERNEL) && !defined(LEFT)
/*off = -offset;*/
lgdr LOCAL_VAR1,OFFSET
lcgr OFF,LOCAL_VAR1
#endif
cijle BN_CUR,0,.LX2
ALIGN_4
.LX4_BN:
#if defined(PREFETCH_INS)
pfd 1, 0(A)
pfd 1, 256(A)
pfd 1, 0(B)
pfd 1, 256(B)
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x4
ALIGN_4
.L8x4_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4
RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x4
cijle LOCAL_VAR1,0,.L8x4_mod
ALIGN_4
.L8x4_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 512(LOCAL_VAR3)
#endif
CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2
#if defined(PREFETCH_INS)
pfd 1, 512(LOCAL_VAR2)
#endif
brctg LOCAL_VAR1,.L8x4_4_BK
ALIGN_4
.L8x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x4_BK_Store
ALIGN_4
.L8x4_BK: /*BK_CUR LOOP */
CALC_8x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x4_BK
ALIGN_4
.L8x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
/*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4
#endif
brctg BM_CUR,.L8x4_BM
ALIGN_4
.L4x4:
tmll BM,4
jz .L2x4
ALIGN_4
.L4x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4
RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x4
cijle LOCAL_VAR1,0,.L4x4_mod
ALIGN_4
.L4x4_4_BK: /*BK_CUR LOOP */
CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x4_4_BK
ALIGN_4
.L4x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x4_BK_Store
ALIGN_4
.L4x4_BK: /*BK_CUR LOOP */
CALC_4x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x4_BK
ALIGN_4
.L4x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4
#endif
ALIGN_2
.L2x4:
tmll BM,2
jz .L1x4
ALIGN_4
.L2x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4
RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x4
cijle LOCAL_VAR1,0,.L2x4_mod
ALIGN_4
.L2x4_4_BK: /*BK_CUR LOOP */
CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x4_4_BK
ALIGN_4
.L2x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x4_BK_Store
ALIGN_4
.L2x4_BK: /*BK_CUR LOOP */
CALC_2x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x4_BK
ALIGN_4
.L2x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4
#endif
ALIGN_4
.L1x4:
tmll BM,1
jz .Lx4_INNER_END
ALIGN_4
.L1x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4
RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x4
cijle LOCAL_VAR1,0,.L1x4_mod
ALIGN_4
.L1x4_4_BK: /*BK_CUR LOOP */
CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x4_4_BK
ALIGN_4
.L1x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x4_BK_Store
ALIGN_4
.L1x4_BK: /*BK_CUR LOOP */
CALC_1x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x4_BK
ALIGN_4
.L1x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_1x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4
#endif
ALIGN_2
.Lx4_INNER_END:
/*add LDC_BYTE_COPY to new*/
sllg LOCAL_VAR1,LDC_BYTE,2 /*op*4 */
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,4
#endif
sllg LOCAL_VAR2,BK,5 /*op*4*sizeof(double) =op*32* 2**5 */
la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */
brctg BN_CUR,.LX4_BN
/*********************************X2 SECTION************************************************/
ALIGN_4
.LX2:
tmll BN,2
jz .Lx1
ALIGN_4
.Lx2_BN:
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x2
ALIGN_4
.L8x2_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2
RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x2
cijle LOCAL_VAR1,0,.L8x2_mod
ALIGN_4
.L8x2_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR3)
pfd 1,64(LOCAL_VAR2)
#endif
CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x2_4_BK
ALIGN_4
.L8x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x2_BK_Store
ALIGN_4
.L8x2_BK: /*BK_CUR LOOP */
CALC_8x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x2_BK
ALIGN_4
.L8x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2
#endif
ALIGN_4
brctg BM_CUR,.L8x2_BM
ALIGN_2
.L4x2:
tmll BM,4
jz .L2x2
ALIGN_4
.L4x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2
RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x2
cijle LOCAL_VAR1,0,.L4x2_mod
ALIGN_4
.L4x2_4_BK: /*BK_CUR LOOP */
CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x2_4_BK
ALIGN_4
.L4x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x2_BK_Store
ALIGN_4
.L4x2_BK: /*BK_CUR LOOP */
CALC_4x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x2_BK
ALIGN_4
.L4x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2
#endif
ALIGN_2
.L2x2:
tmll BM,2
jz .L1x2
ALIGN_4
.L2x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2
RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x2
cijle LOCAL_VAR1,0,.L2x2_mod
ALIGN_4
.L2x2_4_BK: /*BK_CUR LOOP */
CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x2_4_BK
ALIGN_4
.L2x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x2_BK_Store
ALIGN_4
.L2x2_BK: /*BK_CUR LOOP */
CALC_2x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x2_BK
ALIGN_4
.L2x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2
#endif
ALIGN_2
.L1x2:
tmll BM,1
jz .Lx2_INNER_END
ALIGN_4
.L1x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2
RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x2
cijle LOCAL_VAR1,0,.L1x2_mod
ALIGN_4
.L1x2_4_BK: /*BK_CUR LOOP */
CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x2_4_BK
ALIGN_4
.L1x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x2_BK_Store
ALIGN_4
.L1x2_BK: /*BK_CUR LOOP */
CALC_1x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x2_BK
ALIGN_4
.L1x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_1x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2
#endif
ALIGN_2
.Lx2_INNER_END:
/*add LDC_BYTE_COPY to new*/
la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*op*2 */
sllg LOCAL_VAR2,BK,4 /*op*2*sizeof(double) =op*16* 2**4 */
la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,2
#endif
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */
/*********************************X1 SECTION************************************************/
ALIGN_2
.Lx1:
tmll BN,1
jz .L_FUNC_END
ALIGN_4
.Lx1_BN:
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x1
ALIGN_4
.L8x1_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1
RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x1
cijle LOCAL_VAR1,0,.L8x1_mod
ALIGN_4
.L8x1_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR3)
#endif
CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x1_4_BK
ALIGN_4
.L8x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x1_BK_Store
ALIGN_4
.L8x1_BK: /*BK_CUR LOOP */
CALC_8x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x1_BK
ALIGN_4
.L8x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1
#endif
ALIGN_4
brctg BM_CUR,.L8x1_BM
ALIGN_2
.L4x1:
tmll BM,4
jz .L2x1
ALIGN_4
.L4x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1
RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x1
cijle LOCAL_VAR1,0,.L4x1_mod
ALIGN_4
.L4x1_4_BK: /*BK_CUR LOOP */
CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x1_4_BK
ALIGN_4
.L4x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x1_BK_Store
ALIGN_4
.L4x1_BK: /*BK_CUR LOOP */
CALC_4x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x1_BK
ALIGN_4
.L4x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1
#endif
ALIGN_2
.L2x1:
tmll BM,2
jz .L1x1
ALIGN_4
.L2x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1
RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x1
cijle LOCAL_VAR1,0,.L2x1_mod
ALIGN_4
.L2x1_4_BK: /*BK_CUR LOOP */
CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x1_4_BK
ALIGN_4
.L2x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x1_BK_Store
ALIGN_4
.L2x1_BK: /*BK_CUR LOOP */
CALC_2x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x1_BK
ALIGN_4
.L2x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1
#endif
ALIGN_2
.L1x1:
tmll BM, 1
jz .Lx1_INNER_END
ALIGN_4
.L1x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1
RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x1
cijle LOCAL_VAR1,0,.L1x1_mod
ALIGN_4
.L1x1_4_BK: /*BK_CUR LOOP */
CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x1_4_BK
ALIGN_4
.L1x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x1_BK_Store
ALIGN_4
.L1x1_BK: /*BK_CUR LOOP */
CALC_1x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x1_BK
ALIGN_4
.L1x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1
#endif
ALIGN_2
.Lx1_INNER_END:
/*add LDC_BYTE_COPY to new*/
sllg LOCAL_VAR2,BK,3 /*op*2*sizeof(double) =op*8* 2**3 */
la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,1
#endif
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(double) */
ALIGN_2
.L_FUNC_END:
/*end*/
#if defined(TRMMKERNEL)
ld OFFSET,40(%r15)
lmg %r6,%r13,48(%r15)
#else
lmg %r6,%r12,48(%r15)
#endif
br %r14
.end

1143
kernel/zarch/skernelMacros.S Normal file

File diff suppressed because it is too large Load Diff

858
kernel/zarch/strmm8x4V.S Normal file
View File

@ -0,0 +1,858 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2017/03/06 AbdelRauf (quickwritereader@gmail.com)
* BLASTEST : passed
* CTEST : passed
* TEST : passed
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/*
Single-precision (strmm/sgemm) 8x4 vectorized kernel for zarch (z13 vector facility).
C-level signature of the routine implemented below:
#BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,stack[160] ,ldc=stack[168]
offset=stack[176]
**********************************************************************************************/
/*Note: r0 can not be used as address disp register */
#define BM %r2 /* M: rows of A and C */
#define BM_CUR %r0 /* countdown of full 8-row tiles of M */
#define BN %r3 /* N: columns of B and C */
#define BN_CUR %r10 /* countdown of full 4-column panels of N */
#define BK %r4 /* K: shared inner dimension */
#define LDC_BYTE %r8 /* ldc scaled to bytes (ldc * sizeof(float)) */
#define ALPHA %f0 /* scalar alpha argument (arrives in f0) */
#define ALPHA_VECT %v0 /* alpha replicated into a vector register */
#define LOCAL_VAR1 %r9 /* scratch: loop counters / temporaries */
#define LOCAL_VAR2 %r1 /* scratch: current B pointer / temporaries */
#define LOCAL_VAR3 %r11 /* scratch: current A pointer */
#define A %r5 /* packed A buffer */
#define B %r6 /* packed B buffer */
#define CIJ %r7 /* C pointer, advanced per column panel */
#define CIJ_LOCAL %r12 /* C pointer, advanced per row tile */
#define OFF %r13 /* TRMM offset counter */
#define OFFSET %f8 /* FP register used to stash the integer offset value */
#define ALIGN_4 .align 16
#define ALIGN_2 .align 8
#define PREFETCH_INS 1
/**************************Include kernel helper macros**********************************/
#include "skernelMacros.S"
/***********************************SGEMM***********************************************************/
/*
 * Kernel body. Outer loop walks 4-column panels of B/C (BN_CUR); for each
 * panel, rows of A/C are processed as 8/4/2/1-row tiles. Each tile runs an
 * unrolled-by-4 K loop (CALC_*_4) plus a remainder K loop (CALC_*), then
 * scales by alpha and stores via STORE_*. TRMM builds re-derive the A/B
 * pointers and the effective K per tile from OFF (macros in skernelMacros.S).
 */
PROLOGUE
#if defined(TRMMKERNEL)
std OFFSET,40(%r15)
stmg %r6,%r13,48(%r15)
#else
stmg %r6,%r12,48(%r15)
#endif
lg CIJ, 160(%r15)
lg LOCAL_VAR1, 168(%r15)
#if defined(TRMMKERNEL)
lg OFF,176(%r15)
ldgr OFFSET ,OFF
#endif
srlg BN_CUR,BN,2 /*BN_CUR = N/4: number of full 4-column panels*/
vrepf ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/
vldeb ALPHA_VECT,ALPHA_VECT /*lengthen even-indexed short elements to long format*/
sllg LDC_BYTE, LOCAL_VAR1,2 /*calculate ldc stride in bytes: float=4 -> x<<2 */
#if defined(TRMMKERNEL) && !defined(LEFT)
/*off = -offset;*/
lgdr LOCAL_VAR1,OFFSET
lcgr OFF,LOCAL_VAR1
#endif
cijle BN_CUR,0,.LX2
ALIGN_4
.LX4_BN:
#if defined(PREFETCH_INS)
pfd 1, 0(A)
pfd 1, 256(A)
pfd 1, 0(B)
pfd 1, 256(B)
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3 /*BM_CUR = M/8: number of full 8-row tiles*/
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x4
ALIGN_4
.L8x4_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4
RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x4
cijle LOCAL_VAR1,0,.L8x4_mod
ALIGN_4
.L8x4_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR3)
#endif
CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR2)
#endif
brctg LOCAL_VAR1,.L8x4_4_BK
ALIGN_4
.L8x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x4_BK_Store
ALIGN_4
.L8x4_BK: /*BK_CUR LOOP */
CALC_8x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x4_BK
ALIGN_4
.L8x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
/*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4
#endif
brctg BM_CUR,.L8x4_BM
ALIGN_4
.L4x4:
tmll BM,4
jz .L2x4
ALIGN_4
.L4x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4
RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x4
cijle LOCAL_VAR1,0,.L4x4_mod
ALIGN_4
.L4x4_4_BK: /*BK_CUR LOOP */
CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x4_4_BK
ALIGN_4
.L4x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x4_BK_Store
ALIGN_4
.L4x4_BK: /*BK_CUR LOOP */
CALC_4x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x4_BK
ALIGN_4
.L4x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4
#endif
ALIGN_2
.L2x4:
tmll BM,2
jz .L1x4
ALIGN_4
.L2x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4
RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x4
cijle LOCAL_VAR1,0,.L2x4_mod
ALIGN_4
.L2x4_4_BK: /*BK_CUR LOOP */
CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x4_4_BK
ALIGN_4
.L2x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x4_BK_Store
ALIGN_4
.L2x4_BK: /*BK_CUR LOOP */
CALC_2x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x4_BK
ALIGN_4
.L2x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4
#endif
ALIGN_4
.L1x4:
tmll BM,1
jz .Lx4_INNER_END
ALIGN_4
.L1x4_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4
RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x4
cijle LOCAL_VAR1,0,.L1x4_mod
ALIGN_4
.L1x4_4_BK: /*BK_CUR LOOP */
CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x4_4_BK
ALIGN_4
.L1x4_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x4_BK_Store
ALIGN_4
.L1x4_BK: /*BK_CUR LOOP */
CALC_1x4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x4_BK
ALIGN_4
.L1x4_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
/* NOTE(review): scalar ALPHA is passed here while the wider stores pass
   ALPHA_VECT -- confirm STORE_1x4 in skernelMacros.S expects the scalar. */
STORE_1x4 ALPHA ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4
#endif
ALIGN_2
.Lx4_INNER_END:
/*advance C by 4 columns and B past the consumed 4-wide panel*/
sllg LOCAL_VAR1,LDC_BYTE,2 /*op*4 */
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,4
#endif
sllg LOCAL_VAR2,BK,4 /*op*4*sizeof(float) =op*16* 2**4 */
la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(float) */
brctg BN_CUR,.LX4_BN
/*********************************X2 SECTION************************************************/
/* Handles a remaining 2-column panel of B/C (BN bit 1 set); same 8/4/2/1-row
   tile cascade as the X4 section. */
ALIGN_4
.LX2:
tmll BN,2
jz .Lx1
ALIGN_4
.Lx2_BN:
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x2
ALIGN_4
.L8x2_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2
RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x2
cijle LOCAL_VAR1,0,.L8x2_mod
ALIGN_4
.L8x2_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR3)
pfd 1,64(LOCAL_VAR2)
#endif
CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x2_4_BK
ALIGN_4
.L8x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x2_BK_Store
ALIGN_4
.L8x2_BK: /*BK_CUR LOOP */
CALC_8x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x2_BK
ALIGN_4
.L8x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2
#endif
ALIGN_4
brctg BM_CUR,.L8x2_BM
ALIGN_2
.L4x2:
tmll BM,4
jz .L2x2
ALIGN_4
.L4x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2
RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x2
cijle LOCAL_VAR1,0,.L4x2_mod
ALIGN_4
.L4x2_4_BK: /*BK_CUR LOOP */
CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x2_4_BK
ALIGN_4
.L4x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x2_BK_Store
ALIGN_4
.L4x2_BK: /*BK_CUR LOOP */
CALC_4x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x2_BK
ALIGN_4
.L4x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2
#endif
ALIGN_2
.L2x2:
tmll BM,2
jz .L1x2
ALIGN_4
.L2x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2
RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x2
cijle LOCAL_VAR1,0,.L2x2_mod
ALIGN_4
.L2x2_4_BK: /*BK_CUR LOOP */
CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x2_4_BK
ALIGN_4
.L2x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x2_BK_Store
ALIGN_4
.L2x2_BK: /*BK_CUR LOOP */
CALC_2x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x2_BK
ALIGN_4
.L2x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2
#endif
ALIGN_2
.L1x2:
tmll BM,1
jz .Lx2_INNER_END
ALIGN_4
.L1x2_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2
RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x2
cijle LOCAL_VAR1,0,.L1x2_mod
ALIGN_4
.L1x2_4_BK: /*BK_CUR LOOP */
CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x2_4_BK
ALIGN_4
.L1x2_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x2_BK_Store
ALIGN_4
.L1x2_BK: /*BK_CUR LOOP */
CALC_1x2 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x2_BK
ALIGN_4
.L1x2_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_1x2 ALPHA ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2
#endif
ALIGN_2
.Lx2_INNER_END:
/*advance C by 2 columns and B past the consumed 2-wide panel*/
la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*op*2 */
sllg LOCAL_VAR2,BK,3 /*op*2*sizeof(float) =op*8 2**3 */
la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*2*/
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,2
#endif
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*2*sizeof(float) */
/*********************************X1 SECTION************************************************/
/* Handles a final single column of B/C (BN bit 0 set), then restores
   registers and returns. */
ALIGN_2
.Lx1:
tmll BN,1
jz .L_FUNC_END
ALIGN_4
.Lx1_BN:
#if defined(TRMMKERNEL) && defined(LEFT)
/*off = offset;*/
lgdr OFF,OFFSET
#endif
srlg BM_CUR,BM,3
lgr LOCAL_VAR3,A
lgr CIJ_LOCAL,CIJ
cijle BM_CUR,0,.L4x1
ALIGN_4
.L8x1_BM: /*BM_CUR LOOP */
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1
RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_8x1
cijle LOCAL_VAR1,0,.L8x1_mod
ALIGN_4
.L8x1_4_BK: /*BK_CUR LOOP */
#if defined(PREFETCH_INS)
pfd 1, 256(LOCAL_VAR3)
#endif
CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x1_4_BK
ALIGN_4
.L8x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,8,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L8x1_BK_Store
ALIGN_4
.L8x1_BK: /*BK_CUR LOOP */
CALC_8x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L8x1_BK
ALIGN_4
.L8x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1
#endif
ALIGN_4
brctg BM_CUR,.L8x1_BM
ALIGN_2
.L4x1:
tmll BM,4
jz .L2x1
ALIGN_4
.L4x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1
RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_4x1
cijle LOCAL_VAR1,0,.L4x1_mod
ALIGN_4
.L4x1_4_BK: /*BK_CUR LOOP */
CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x1_4_BK
ALIGN_4
.L4x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L4x1_BK_Store
ALIGN_4
.L4x1_BK: /*BK_CUR LOOP */
CALC_4x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L4x1_BK
ALIGN_4
.L4x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_4x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1
#endif
ALIGN_2
.L2x1:
tmll BM,2
jz .L1x1
ALIGN_4
.L2x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1
RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_2x1
cijle LOCAL_VAR1,0,.L2x1_mod
ALIGN_4
.L2x1_4_BK: /*BK_CUR LOOP */
CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x1_4_BK
ALIGN_4
.L2x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L2x1_BK_Store
ALIGN_4
.L2x1_BK: /*BK_CUR LOOP */
CALC_2x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L2x1_BK
ALIGN_4
.L2x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_2x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1
#endif
ALIGN_2
.L1x1:
tmll BM, 1
jz .Lx1_INNER_END
ALIGN_4
.L1x1_BM: /*BM start*/
#if defined(TRMMKERNEL)
/* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1
RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
srl LOCAL_VAR1,2
#else
srlg LOCAL_VAR1,BK,2 /*refresh BK*/
lgr LOCAL_VAR2,B /*refresh BPOINT*/
#endif
ZERO_CVEC_1x1
cijle LOCAL_VAR1,0,.L1x1_mod
ALIGN_4
.L1x1_4_BK: /*BK_CUR LOOP */
CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x1_4_BK
ALIGN_4
.L1x1_mod:
#if defined(TRMMKERNEL)
RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
nill LOCAL_VAR1,3
#else
la LOCAL_VAR1,3(0,0)
NGR LOCAL_VAR1,BK /*refresh BK*/
#endif
jz .L1x1_BK_Store
ALIGN_4
.L1x1_BK: /*BK_CUR LOOP */
CALC_1x1 LOCAL_VAR3,LOCAL_VAR2
brctg LOCAL_VAR1,.L1x1_BK
ALIGN_4
.L1x1_BK_Store:
/*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/
STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE
#if defined(TRMMKERNEL)
RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1
#endif
ALIGN_2
.Lx1_INNER_END:
/*advance C by one column and B past the consumed 1-wide panel*/
sllg LOCAL_VAR2,BK,2 /*op*1*sizeof(float) =op*4 2**2 */
la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */
#if defined(TRMMKERNEL) && !defined(LEFT)
aghi OFF,1
#endif
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(float) */
ALIGN_2
.L_FUNC_END:
/*end: restore callee-saved registers (and f8 in TRMM builds), then return*/
#if defined(TRMMKERNEL)
ld OFFSET,40(%r15)
lmg %r6,%r13,48(%r15)
#else
lmg %r6,%r12,48(%r15)
#endif
br %r14
.end

10
param.h
View File

@ -2632,8 +2632,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
@ -2644,17 +2644,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_P 320
#define DGEMM_DEFAULT_P 320
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 224
#define SGEMM_DEFAULT_Q 240
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 352
#define SGEMM_DEFAULT_R 12288
#define SGEMM_DEFAULT_R 8192
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048