From f82fa802d164a064da257bb459c3d13629fd56f8 Mon Sep 17 00:00:00 2001 From: ZhangDanfeng <467688405@qq.com> Date: Thu, 4 Jun 2020 02:08:48 +0800 Subject: [PATCH 1/2] Insert prefetch Signed-off-by: ZhangDanfeng <467688405@qq.com> --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 664 +++++++++++----------- 1 file changed, 319 insertions(+), 345 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S index 4fcce38d5..fec0c9ae9 100644 --- a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -57,6 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha3 s15 #define alphaV3 v15.s[0] +#define A_PRE_SIZE 640 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 96 + // 00 origM // 01 origN // 02 origK @@ -147,13 +151,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_I - ld1 {v0.4s, v1.4s}, [pA], #32 - ld1 {v4.4s, v5.4s}, [pB], #32 + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + ldr d2, [pA], #8 ldr d6, [pB], #8 ldr d3, [pA, #8] ldr d7, [pB, #8] - ldr x22, [pA], #16 fmul v16.4s, v0.4s, v4.s[0] ldr x26, [pB], #16 @@ -163,7 +167,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr x27, [pB], #8 fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmul v22.4s, v0.4s, v4.s[3] fmul v23.4s, v1.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] @@ -194,7 +200,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v19.4s, v1.4s, v4.s[1] ldr x27, [pB], #8 fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] @@ -225,7 +233,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmla v19.4s, v3.4s, v6.s[1] ldr x25, [pB], #8 fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] @@ -248,7 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] @@ -262,21 +274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] @@ -290,66 +298,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x8 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v2.4s, v3.4s}, [pCRow1] + ldp q2, q3, [pCRow1] fmla v2.4s, v18.4s, alphaV2 fmla v3.4s, v19.4s, alphaV3 - st1 {v2.4s, v3.4s}, [pCRow1] + stp q2, q3, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v4.4s, v5.4s}, [pCRow2] + ldp q4, q5, [pCRow2] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow2] + stp q4, q5, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v6.4s, v7.4s}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.4s, v22.4s, alphaV2 fmla v7.4s, v23.4s, alphaV3 - st1 {v6.4s, v7.4s}, [pCRow1] + stp q6, q7, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v0.4s, v1.4s}, [pCRow2] + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + stp q0, q1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v2.4s, v3.4s}, [pCRow1] + ldp q2, q3, [pCRow1] fmla v2.4s, v26.4s, alphaV2 fmla v3.4s, v27.4s, alphaV3 - st1 {v2.4s, v3.4s}, [pCRow1] + stp q2, q3, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v4.4s, v5.4s}, [pCRow2] + ldp q4, q5, [pCRow2] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow2] + stp q4, q5, [pCRow2] - ld1 {v6.4s, v7.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q6, q7, [pCRow1] fmla v6.4s, v30.4s, alphaV2 fmla v7.4s, v31.4s, alphaV3 - st1 {v6.4s, v7.4s}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ - .macro INIT4x8 fmov s16, wzr fmov s18, wzr @@ -362,19 +378,20 
@@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_I - ld1 {v0.4s}, [pA], #16 - ld1 {v4.4s, v5.4s}, [pB], #32 + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 ldr d2, [pA], #8 ldr d6, [pB], #8 ldr d7, [pB, #8] - ldr x21, [pA], #8 + ldr x22, [pA], #8 fmul v16.4s, v0.4s, v4.s[0] ldr x26, [pB], #16 fmul v18.4s, v0.4s, v4.s[1] ldr x27, [pB], #8 fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] @@ -388,13 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov v4.d[1], x24 ldr d7, [pB, #8] fmov v5.d[1], x25 + ldr x22, [pA], #8 fmla v16.4s, v0.4s, v4.s[0] - ldr x21, [pA], #8 - fmla v18.4s, v0.4s, v4.s[1] ldr x26, [pB], #16 - fmla v20.4s, v0.4s, v4.s[2] + fmla v18.4s, v0.4s, v4.s[1] ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] @@ -403,18 +421,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M2 ldr d0, [pA], #8 - fmov v2.d[1], x21 + fmov v2.d[1], x22 ldr d4, [pB], #8 fmov v6.d[1], x26 ldr d5, [pB, #8] fmov v7.d[1], x27 - fmla v16.4s, v2.4s, v6.s[0] ldr x20, [pA], #8 - fmla v18.4s, v2.4s, v6.s[1] + fmla v16.4s, v2.4s, v6.s[0] ldr x24, [pB], #16 - fmla v20.4s, v2.4s, v6.s[2] + fmla v18.4s, v2.4s, v6.s[1] ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] @@ -422,13 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmov v2.d[1], x21 + fmov v2.d[1], x22 fmov v6.d[1], x26 fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] @@ -436,17 +456,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] @@ -456,49 +473,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x8 add pCRow1, pCRow0, LDC - ld1 {v0.4s}, [pCRow0] + ldr q0, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - st1 {v0.4s}, [pCRow0] + str q0, [pCRow0] add pCRow2, pCRow1, LDC - ld1 {v2.4s}, [pCRow1] + ldr q2, [pCRow1] fmla v2.4s, v18.4s, alphaV2 - st1 {v2.4s}, [pCRow1] + str q2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.4s}, [pCRow2] + ldr q4, [pCRow2] fmla v4.4s, v20.4s, alphaV0 - st1 {v4.4s}, [pCRow2] + str q4, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v6.4s}, [pCRow1] + ldr q6, [pCRow1] fmla v6.4s, v22.4s, alphaV2 - st1 {v6.4s}, [pCRow1] + str q6, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v0.4s}, [pCRow2] + ldr q0, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - st1 {v0.4s}, [pCRow2] + str q0, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v2.4s}, [pCRow1] + ldr q2, [pCRow1] fmla v2.4s, v26.4s, alphaV2 - st1 {v2.4s}, [pCRow1] + str q2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.4s}, [pCRow2] + ldr q4, [pCRow2] fmla v4.4s, v28.4s, alphaV0 - st1 {v4.4s}, [pCRow2] + str q4, [pCRow2] - ld1 {v6.4s}, [pCRow1] + ldr q6, [pCRow1] fmla v6.4s, v30.4s, alphaV2 - st1 
{v6.4s}, [pCRow1] + str q6, [pCRow1] add pCRow0, pCRow0, #16 .endm @@ -517,17 +534,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldr d0, [pA], #8 + ldp q4, q5, [pB], #32 fmla v16.2s, v0.2s, v4.s[0] fmla v18.2s, v0.2s, v4.s[1] fmla v20.2s, v0.2s, v4.s[2] fmla v22.2s, v0.2s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.2s, v0.2s, v5.s[0] fmla v26.2s, v0.2s, v5.s[1] fmla v28.2s, v0.2s, v5.s[2] @@ -537,49 +551,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x8 add pCRow1, pCRow0, LDC - ld1 {v0.2s}, [pCRow0] + ldr d0, [pCRow0] fmla v0.2s, v16.2s, alphaV0 - st1 {v0.2s}, [pCRow0] + str d0, [pCRow0] add pCRow2, pCRow1, LDC - ld1 {v2.2s}, [pCRow1] + ldr d2, [pCRow1] fmla v2.2s, v18.2s, alphaV2 - st1 {v2.2s}, [pCRow1] + str d2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.2s}, [pCRow2] + ldr d4, [pCRow2] fmla v4.2s, v20.2s, alphaV0 - st1 {v4.2s}, [pCRow2] + str d4, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v6.2s}, [pCRow1] + ldr d6, [pCRow1] fmla v6.2s, v22.2s, alphaV2 - st1 {v6.2s}, [pCRow1] + str d6, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v0.2s}, [pCRow2] + ldr d0, [pCRow2] fmla v0.2s, v24.2s, alphaV0 - st1 {v0.2s}, [pCRow2] + str d0, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v2.2s}, [pCRow1] + ldr d2, [pCRow1] fmla v2.2s, v26.2s, alphaV2 - st1 {v2.2s}, [pCRow1] + str d2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.2s}, [pCRow2] + ldr d4, [pCRow2] fmla v4.2s, v28.2s, alphaV0 - st1 {v4.2s}, [pCRow2] + str d4, [pCRow2] - ld1 {v6.2s}, [pCRow1] + ldr d6, [pCRow1] fmla v6.2s, v30.2s, alphaV2 - st1 {v6.2s}, [pCRow1] + str d6, [pCRow1] add pCRow0, pCRow0, #8 .endm @@ -598,17 +612,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL1x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ldr s0, [pA] - add pA, pA, #4 + ldp q4, q5, [pB], #32 + ldr s0, [pA], #4 fmla s16, s0, v4.s[0] fmla s18, s0, v4.s[1] fmla s20, s0, v4.s[2] fmla s22, s0, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla s24, s0, v5.s[0] fmla s26, s0, v5.s[1] fmla s28, s0, v5.s[2] @@ -620,47 +631,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pCRow0] fmla s0, s16, alphaV0 - str s0, [pCRow0] + str s0, [pCRow0] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s18, alphaV2 - str s2, [pCRow1] + str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s20, alphaV0 - str s4, [pCRow2] + str s4, [pCRow2] add pCRow2, pCRow1, LDC ldr s6, [pCRow1] fmla s6, s22, alphaV2 - str s6, [pCRow1] + str s6, [pCRow1] add pCRow1, pCRow2, LDC ldr s0, [pCRow2] fmla s0, s24, alphaV0 - str s0, [pCRow2] + str s0, [pCRow2] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s26, alphaV2 - str s2, [pCRow1] + str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s28, alphaV0 - str s4, [pCRow2] + str s4, [pCRow2] ldr s6, [pCRow1] fmla s6, s30, alphaV2 - str s6, [pCRow1] + str s6, [pCRow1] add pCRow0, pCRow0, #4 .endm @@ -679,118 +690,137 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v8.4s}, [pB], #16 - ld1 {v0.4s, v1.4s}, [pA], #32 + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 - ldr d9, [pB], #8 ldr d2, [pA], #8 + ldr d6, [pB], #8 ldr d3, [pA, #8] - fmul v16.4s, v0.4s, v8.s[0] - ldr x25, [pB], #8 - fmul v17.4s, v1.4s, v8.s[0] + fmul v16.4s, v0.4s, v4.s[0] ldr x22, [pA], #16 - fmul v20.4s, v0.4s, v8.s[1] + fmul v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #8 + fmul v18.4s, v0.4s, v4.s[1] ldr x23, [pA], #8 - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - fmul v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.s[3] - fmul v29.4s, v1.4s, v8.s[3] + fmul v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] .endm .macro KERNEL8x4_M1 - ldr d9, [pB], #8 - fmov v8.d[1], x24 ldr d2, [pA], #8 fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 ldr d3, [pA, #8] fmov v1.d[1], x21 - fmla v16.4s, v0.4s, v8.s[0] - ldr x25, [pB], #8 - fmla v17.4s, v1.4s, v8.s[0] ldr x22, [pA], #16 - fmla v20.4s, v0.4s, v8.s[1] + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #8 + fmla v17.4s, v1.4s, v4.s[0] ldr x23, [pA], #8 - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v8.s[2] - fmla v25.4s, v1.4s, v8.s[2] - fmla v28.4s, v0.4s, v8.s[3] - fmla v29.4s, v1.4s, v8.s[3] + fmla v18.4s, v0.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] .endm .macro KERNEL8x4_M2 - ldr d8, [pB], #8 - fmov v9.d[1], x25 ldr d0, [pA], #8 fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 ldr d1, [pA, #8] fmov v3.d[1], x23 - fmla v16.4s, v2.4s, v9.s[0] - ldr x24, [pB], #8 - fmla v17.4s, v3.4s, v9.s[0] ldr x20, [pA], #16 - fmla v20.4s, v2.4s, v9.s[1] + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v6.s[0] ldr x21, [pA], #8 - fmla v21.4s, v3.4s, v9.s[1] - fmla 
v24.4s, v2.4s, v9.s[2] - fmla v25.4s, v3.4s, v9.s[2] - fmla v28.4s, v2.4s, v9.s[3] - fmla v29.4s, v3.4s, v9.s[3] + fmla v18.4s, v2.4s, v6.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] .endm .macro KERNEL8x4_E - fmov v9.d[1], x25 fmov v2.d[1], x22 + fmov v6.d[1], x26 fmov v3.d[1], x23 - fmla v16.4s, v2.4s, v9.s[0] - fmla v17.4s, v3.4s, v9.s[0] - fmla v20.4s, v2.4s, v9.s[1] - fmla v21.4s, v3.4s, v9.s[1] - fmla v24.4s, v2.4s, v9.s[2] - fmla v25.4s, v3.4s, v9.s[2] - fmla v28.4s, v2.4s, v9.s[3] - fmla v29.4s, v3.4s, v9.s[3] + fmla v16.4s, v2.4s, v6.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x4_SUB - ld1 {v8.4s}, [pB], #16 - ld1 {v0.4s, v1.4s}, [pA], #32 - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v8.s[2] - fmla v25.4s, v1.4s, v8.s[2] - fmla v28.4s, v0.4s, v8.s[3] - fmla v29.4s, v1.4s, v8.s[3] + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] .endm .macro SAVE8x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 
fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + ldp q0, q1, [pCRow2] + fmla v0.4s, v20.4s, alphaV0 + fmla v1.4s, v21.4s, alphaV1 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] + fmla v4.4s, v22.4s, alphaV0 + fmla v5.4s, v23.4s, alphaV1 + stp q4, q5, [pCRow1] add pCRow0, pCRow0, #32 .endm @@ -800,139 +830,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s18, wzr + fmov s20, wzr + fmov s22, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldr q0, [pA], #16 + ldr q4, [pB], #16 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] - - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] - - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr x22, [pA], #8 + ldr x26, [pB], #8 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + 
ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr x20, [pA], #8 + ldr x24, [pB], #8 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldr q0, [pA], #16 + ldr q4, [pB], #16 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] .endm .macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q1, [pCRow1] + fmla v1.4s, v18.4s, alphaV2 + str q1, [pCRow1] add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + ldr q2, [pCRow2] + fmla v2.4s, v20.4s, alphaV0 + str q2, [pCRow2] add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + 
ldr q3, [pCRow1] + fmla v3.4s, v22.4s, alphaV2 + str q3, [pCRow1] add pCRow0, pCRow0, #16 .endm @@ -941,42 +921,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 fmov s16, wzr - fmov s20, s16 - fmov s24, s20 - fmov s28, s16 + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 .endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldr d0, [pA], #8 + ldr q4, [pB], #16 - fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] + ldr d8, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + str d8, [pCRow0] add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + ldr d12, [pCRow1] + fmla v12.2s, v18.2s, alphaV1 + str d12, [pCRow1] add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + ldr d8, [pCRow2] + fmla v8.2s, v20.2s, alphaV2 + str d8, [pCRow2] add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d12, [pCRow1] + fmla v12.2s, v22.2s, alphaV3 + str d12, [pCRow1] add pCRow0, pCRow0, #8 .endm @@ -1023,39 +1001,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT8x2 fmov s16, wzr fmov s17, s16 - fmov s20, s17 - fmov s21, s16 + fmov s18, s17 + fmov s19, s16 .endm .macro KERNEL8x2_SUB - ld1 {v8.2s}, [pB] - add pB, pB, #8 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldr d4, [pB], #8 - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] - - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] .endm .macro SAVE8x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] add pCRow0, pCRow0, #32 .endm @@ -1162,23 +1138,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x1_SUB - ldr s8, [pB] - add pB , pB, #4 + ldr s4, [pB], #4 + ldp q0, q1, [pA], #32 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] + fmla v16.4s, v0.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v1.4s, v4.s[0] .endm .macro SAVE8x1 - ld1 {v0.4s, v1.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 .endm @@ -1247,13 +1221,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr s0, [pA] add pA , pA, #4 - fmadd s16, s0, s8, s16 + fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 - ldr s8, [pCRow0] + ldr s8, [pCRow0] fmla s8, s16, alphaV0 - str s8, [pCRow0] + str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm @@ -1290,8 +1264,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB mov counterJ, origN - asr counterJ, counterJ, #3 // J = J / 8 - cmp counterJ, #0 + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 ble .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ @@ -1308,15 +1282,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L8_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 - cmp counterI, #0 + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 ble .Lsgemm_kernel_L8_M4_BEGIN .Lsgemm_kernel_L8_M8_20: mov pB, origPB - asr counterL , origK, #3 // L = K / 8 + asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 16 to do? blt .Lsgemm_kernel_L8_M8_32 @@ -1415,7 +1389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L8_M4_32 @@ -1487,7 +1461,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L8_M2_40 @@ -1538,7 +1512,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L8_M1_40 @@ -1603,15 +1577,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 - cmp counterI, #0 + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 ble .Lsgemm_kernel_L4_M4_BEGIN .Lsgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L4_M8_32 @@ -1683,7 +1657,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L4_M4_32 @@ -1755,7 +1729,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L4_M2_40 @@ -1806,7 +1780,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L4_M1_40 @@ -1867,7 +1841,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 + asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 ble .Lsgemm_kernel_L2_M4_BEGIN @@ -2041,7 +2015,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble .Lsgemm_kernel_L2_M1_40 @@ -2100,7 +2074,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 + asr counterI, counterI, #3 cmp counterI, #0 ble .Lsgemm_kernel_L1_M4_BEGIN @@ -2223,7 +2197,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L1_M2_40 @@ -2274,7 +2248,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L1_M1_40 From 9b7877ccf1bd77a24adacd79c3b91addc86d2408 Mon Sep 17 00:00:00 2001 From: ZhangDanfeng <467688405@qq.com> Date: Thu, 4 Jun 2020 02:09:38 +0800 Subject: [PATCH 2/2] sgemm copy source init Signed-off-by: ZhangDanfeng <467688405@qq.com> --- kernel/arm64/KERNEL.CORTEXA53 | 9 +- kernel/arm64/sgemm_ncopy_8.S | 562 +++++++++++++++++++++++++++ kernel/arm64/sgemm_tcopy_8.S | 707 ++++++++++++++++++++++++++++++++++ 3 files changed, 1270 insertions(+), 8 deletions(-) create mode 100644 kernel/arm64/sgemm_ncopy_8.S create mode 100644 kernel/arm64/sgemm_tcopy_8.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index 4219acf98..eba38a92e 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -126,16 +126,9 @@ endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ifeq ($(SGEMM_UNROLL_N), 16) + SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) 
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S new file mode 100644 index 000000000..f99b1d992 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 +#define A05 x9 +#define A06 x10 +#define A07 x11 +#define A08 x12 + +#define I x13 +#define J x14 +#define K x15 + +#define TEMP1 x16 +#define TEMP2 x17 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x8 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v12.s[0], v0.s[2] + ins v14.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + ins v12.s[1], v1.s[2] + ins v14.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v12.s[2], v2.s[2] + ins v14.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + ins v12.s[3], v3.s[2] + ins v14.s[3], v3.s[3] + + ldr q4, [A05], #16 + ldr q5, [A06], #16 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins 
v13.s[0], v4.s[2] + ins v15.s[0], v4.s[3] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + ins v13.s[1], v5.s[2] + ins v15.s[1], v5.s[3] + + ldr q6, [A07], #16 + ldr q7, [A08], #16 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v13.s[2], v6.s[2] + ins v15.s[2], v6.s[3] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + ins v13.s[3], v7.s[2] + ins v15.s[3], v7.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 +.endm + +.macro COPY2x8 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + + ldr d6, [A07], #8 + ldr d7, [A08], #8 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY1x8 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + ldr s4, [A05], #4 + ldr s5, [A06], #4 + ins v9.s[0], v4.s[0] + ins v9.s[1], v5.s[0] + + ldr s6, [A07], #4 + ldr s7, [A08], #4 + ins v9.s[2], v6.s[0] + ins v9.s[3], v7.s[0] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY4x4 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 
{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY2x4 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY1x4 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + st1 {v8.4s}, [B00], #16 +.endm + +.macro COPY4x2 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 +.endm + +.macro COPY2x2 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + st1 {v8.2s, v9.2s}, [B00], #16 +.endm + +.macro COPY1x2 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + st1 {v8.2s}, [B00], #8 +.endm + +.macro COPY1x1 + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Lsgemm_ncopy_L8_BEGIN: + + asr J, N, #3 // J = N / 8 + cmp J, #0 + ble .Lsgemm_ncopy_L4_BEGIN + + .align 5 +.Lsgemm_ncopy_L8_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A00, A08, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_40 + + asr K, M, #4 // 
K = M / 16(cacheline) + mov TEMP1, A01 + + .align 5 +.Lsgemm_ncopy_L8_warmup_1: + + ldr s0, [TEMP1], #64 // warm-up: one load per 64-byte step; s0 is discarded (COPY4x8 reloads v0) + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_1 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A02 + + .align 5 +.Lsgemm_ncopy_L8_warmup_2: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_2 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A03 + + .align 5 +.Lsgemm_ncopy_L8_warmup_3: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_3 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A04 + + .align 5 +.Lsgemm_ncopy_L8_warmup_4: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_4 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A05 + + .align 5 +.Lsgemm_ncopy_L8_warmup_5: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_5 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A06 + + .align 5 +.Lsgemm_ncopy_L8_warmup_6: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_6 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A07 + + .align 5 +.Lsgemm_ncopy_L8_warmup_7: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_7 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A08 + + .align 5 +.Lsgemm_ncopy_L8_warmup_8: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_ncopy_L8_warmup_8 + + .align 5 +.Lsgemm_ncopy_L8_M4_20: + + COPY4x8 + + subs I, I, #1 + bne .Lsgemm_ncopy_L8_M4_20 + +.Lsgemm_ncopy_L8_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_60 + + COPY2x8 + +.Lsgemm_ncopy_L8_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_END + + COPY1x8 + +.Lsgemm_ncopy_L8_M4_END: + + subs J , J, #1 // j-- + bne .Lsgemm_ncopy_L8_M4_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L4_BEGIN: + + tst N, #7 + ble .Lsgemm_ncopy_L999 + + tst N, #4 + ble .Lsgemm_ncopy_L2_BEGIN + +.Lsgemm_ncopy_L4_M4_BEGIN: +
mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_40 + + .align 5 +.Lsgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I, I, #1 + bne .Lsgemm_ncopy_L4_M4_20 + +.Lsgemm_ncopy_L4_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_60 + + COPY2x4 + +.Lsgemm_ncopy_L4_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_END + + COPY1x4 + +.Lsgemm_ncopy_L4_M4_END: + + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L2_BEGIN: + + tst N, #2 + ble .Lsgemm_ncopy_L1_BEGIN + +.Lsgemm_ncopy_L2_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_40 + + .align 5 +.Lsgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Lsgemm_ncopy_L2_M4_20 + + +.Lsgemm_ncopy_L2_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_60 + + COPY2x2 + +.Lsgemm_ncopy_L2_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_END + + COPY1x2 + +.Lsgemm_ncopy_L2_M4_END: + +.Lsgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Lsgemm_ncopy_L999 + +.Lsgemm_ncopy_L1_M1_BEGIN: + + mov A01, A00 + + mov I, M + cmp I, #0 + ble .Lsgemm_ncopy_L1_M1_END + + .align 5 +.Lsgemm_ncopy_L1_M1_20: + + COPY1x1 + + subs I, I, #1 + bne .Lsgemm_ncopy_L1_M1_20 + +.Lsgemm_ncopy_L1_M1_END: + +.Lsgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_tcopy_8.S b/kernel/arm64/sgemm_tcopy_8.S new file mode 100644 index 000000000..7d81ba266 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_8.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 /* not x18: AAPCS64 platform register (reserved on Apple/Windows); x21 is saved/restored below */ +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldr x19, [sp, #(5 * 16 + 8)] // reload x19 only: x18 is never modified here, so it is not written back + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add
A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + add B02, B02, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + 
add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B02] + add B02, B02, #16 + stp d6, d7, [B02] + add B02, B02, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + add A05, A05, #4 // step one float like rows 1-4; a 64-bit reload here would read 4 bytes past the element + add A06, A06, #4 + add A07, A07, #4 + add A08, A08, #4 + + stp s4, s5, [B03] + add B03, B03, #8 + stp s6, s7, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + + add B01, B01, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01,
#A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] + add B02, B02, #16 + stp d2, d3, [B02] + + add B02, B02, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B01] + add B01, B01, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B02] + add B02, B02, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B03] + + add B03, B03, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] 
+ + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B00] + + add B00, B00, M8 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B01] + + add B01, B01, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B02] + + add B02, B02, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B03] + + add B03, B03, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-8 // B01 = N rounded down to a multiple of 8 + and B02 , N , #-4 // B02 = N rounded down to a multiple of 4 + and B03 , N , #-2 // B03 = N rounded down to a multiple of 2 + + mul B01, B01, TEMP1 // scale each by M * SIZE to get a byte offset into B + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + + add B01 , B01, B // B01: destination written by the COPY4xN macros + add B02 , B02, B // B02: destination written by the COPY2xN macros + add B03 , B03, B // B03: destination written by the COPY1xN macros + + lsl M8, M, #5 // M8 = M * 8 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M8_BEGIN: + + mov A01, A // A01..A08 = eight consecutive rows, LDA bytes apart + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA // A now points at the row after this block + + mov B00, B // B00: destination written by the COPY8xN macros (stepped by M8) + add B, B00, #256 // B = B + 8 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M8_40 + + .align 5 +.Lsgemm_tcopy_L8_M8_20: + + COPY8x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M8_20 + +.Lsgemm_tcopy_L8_M8_40: + + tst N , #4 // skipped when bit 2 of N is clear + ble .Lsgemm_tcopy_L8_M8_60 + + COPY4x8 + +.Lsgemm_tcopy_L8_M8_60: + + tst N , #2 // skipped when bit 1 of N is clear + ble .Lsgemm_tcopy_L8_M8_80 + + COPY2x8 + +.Lsgemm_tcopy_L8_M8_80: + + tst N, #1 // skipped when bit 0 of N is clear + ble .Lsgemm_tcopy_L8_M8_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M8_END: + + subs J, J, #1 // j-- + bne .Lsgemm_tcopy_L8_M8_BEGIN +
+/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + + tst M, #7 // nothing left when M is a multiple of 8 + ble .Lsgemm_tcopy_L999 + + tst M, #4 // skipped when bit 2 of M is clear + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M8_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #128 // B = B + 4 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M8_40 + + .align 5 +.Lsgemm_tcopy_L4_M8_20: + + COPY8x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M8_20 + +.Lsgemm_tcopy_L4_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L4_M8_60 + + COPY4x4 + +.Lsgemm_tcopy_L4_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M8_80 + + COPY2x4 + +.Lsgemm_tcopy_L4_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L4_M8_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 // nothing left when the low two bits of M are clear + ble .Lsgemm_tcopy_L999 + + tst M, #2 // skipped when bit 1 of M is clear + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #64 // B = B + 2 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M8_40 + + .align 5 +.Lsgemm_tcopy_L2_M8_20: + + COPY8x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M8_20 + +.Lsgemm_tcopy_L2_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L2_M8_60 + + COPY4x2 + +.Lsgemm_tcopy_L2_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M8_80 + + COPY2x2 + +.Lsgemm_tcopy_L2_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M8_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 // skipped when M is even + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M8_40 + + .align 5 +.Lsgemm_tcopy_L1_M8_20: + + COPY8x1 + +
subs I , I , #1 + bne .Lsgemm_tcopy_L1_M8_20 + +.Lsgemm_tcopy_L1_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L1_M8_60 + + COPY4x1 + +.Lsgemm_tcopy_L1_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M8_80 + + COPY2x1 + +.Lsgemm_tcopy_L1_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M8_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M8_END: + +.Lsgemm_tcopy_L999: + + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE