From edb423d772c3f91841fbad9afbff024aa109b893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 21:52:49 +0800 Subject: [PATCH] align general register usage with strmm_kernel_8x8 --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 242 +++++++++++----------- 1 file changed, 120 insertions(+), 122 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S index 0c9629eab..4fcce38d5 100644 --- a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ - #define ASSEMBLER #include "common.h" @@ -78,14 +77,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 17 // 18 must save // 19 must save -// 20 must save -// 21 must save -// 22 must save -// 23 must save -// 24 must save -// 25 must save -// 26 must save -// 27 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 // 28 must save // 29 frame // 30 link @@ -155,13 +154,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr d3, [pA, #8] ldr d7, [pB, #8] - ldr x20, [pA], #16 + ldr x22, [pA], #16 fmul v16.4s, v0.4s, v4.s[0] - ldr x24, [pB], #16 + ldr x26, [pB], #16 fmul v17.4s, v1.4s, v4.s[0] - ldr x21, [pA], #8 + ldr x23, [pA], #8 fmul v18.4s, v0.4s, v4.s[1] - ldr x25, [pB], #8 + ldr x27, [pB], #8 fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v21.4s, v1.4s, v4.s[2] @@ -179,21 +178,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x8_M1 ldr d2, [pA], #8 - fmov v0.d[1], x18 + fmov v0.d[1], x20 ldr d6, [pB], #8 - fmov v4.d[1], x22 + fmov v4.d[1], x24 ldr d3, [pA, #8] - fmov v1.d[1], x19 + fmov v1.d[1], x21 ldr d7, [pB, #8] - fmov v5.d[1], x23 + fmov v5.d[1], x25 fmla v16.4s, v0.4s, v4.s[0] - ldr x20, [pA], #16 + ldr x22, [pA], #16 fmla v17.4s, v1.4s, v4.s[0] - ldr x24, [pB], #16 + ldr x26, [pB], #16 fmla v18.4s, v0.4s, v4.s[1] - ldr x21, [pA], #8 + ldr x23, [pA], #8 fmla v19.4s, v1.4s, v4.s[1] - ldr x25, [pB], #8 + ldr x27, [pB], #8 fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] @@ -210,21 +209,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x8_M2 ldr d0, [pA], #8 - fmov v2.d[1], x20 + fmov v2.d[1], x22 ldr d4, [pB], #8 - fmov v6.d[1], x24 + fmov v6.d[1], x26 ldr d1, [pA, #8] - fmov v3.d[1], x21 + fmov v3.d[1], x23 ldr d5, [pB, #8] - fmov v7.d[1], x25 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] - ldr x18, [pA], #16 + ldr x20, [pA], #16 fmla v17.4s, v3.4s, v6.s[0] - ldr x22, [pB], #16 + ldr x24, [pB], #16 fmla v18.4s, v2.4s, v6.s[1] - ldr x19, [pA], #8 + ldr x21, [pA], #8 fmla v19.4s, v3.4s, v6.s[1] - ldr x23, [pB], #8 + ldr x25, [pB], #8 fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] @@ -240,10 +239,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_E - fmov v2.d[1], x20 - fmov v6.d[1], x24 - fmov v3.d[1], x21 - fmov v7.d[1], x25 + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] @@ -363,67 +362,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_I - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] - - ld1 {v6.4s}, [pB] - add pB, pB, #16 - ld1 {v7.4s}, [pB] - add pB, pB, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] - - ld1 {v6.4s}, [pB] - add pB, pB, #16 - ld1 {v7.4s}, [pB] - add pB, pB, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] - - ld1 
{v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] @@ -678,93 +679,90 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, 
v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] .endm .macro SAVE8x4