From a0128aa489720ac2fd883dbeebfecffd4812ff99 Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Tue, 24 Oct 2017 10:47:11 +0000
Subject: [PATCH] ARM64: Convert all labels to local labels

When an application is debugged or profiled with perf or similar tools,
the kernels appear scattered across the profile reports. This is because
the labels within the kernels are not local, so each label is shown as a
separate function. To avoid this, all labels within the kernels are
changed to local (.L-prefixed) labels.
---
 kernel/arm64/amax.S                           |  50 +-
 kernel/arm64/asum.S                           |  40 +-
 kernel/arm64/axpy.S                           |  42 +-
 kernel/arm64/casum.S                          |  40 +-
 kernel/arm64/cgemm_kernel_4x4.S               | 284 +++++------
 kernel/arm64/cgemm_kernel_8x4.S               | 350 ++++++-------
 kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S  | 350 ++++++-------
 kernel/arm64/copy.S                           |  40 +-
 kernel/arm64/ctrmm_kernel_4x4.S               | 258 +++++-----
 kernel/arm64/ctrmm_kernel_8x4.S               | 350 ++++++-------
 kernel/arm64/daxpy_thunderx2t99.S             |  44 +-
 kernel/arm64/dgemm_kernel_4x4.S               | 286 +++++------
 kernel/arm64/dgemm_kernel_4x8.S               | 352 ++++++-------
 kernel/arm64/dgemm_kernel_8x4.S               | 338 ++++++------
 kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S  | 338 ++++++------
 kernel/arm64/dgemm_ncopy_4.S                  |  72 +--
 kernel/arm64/dgemm_ncopy_8.S                  |  96 ++--
 kernel/arm64/dgemm_tcopy_4.S                  |  72 +--
 kernel/arm64/dgemm_tcopy_8.S                  | 112 ++--
 kernel/arm64/dot.S                            |  40 +-
 kernel/arm64/dtrmm_kernel_4x4.S               | 258 +++++-----
 kernel/arm64/dtrmm_kernel_4x8.S               | 352 ++++++-------
 kernel/arm64/dtrmm_kernel_8x4.S               | 338 ++++++------
 kernel/arm64/gemv_n.S                         |  62 +--
 kernel/arm64/gemv_t.S                         |  62 +--
 kernel/arm64/iamax.S                          |  48 +-
 kernel/arm64/izamax.S                         |  48 +-
 kernel/arm64/nrm2.S                           |  32 +-
 kernel/arm64/rot.S                            |  40 +-
 kernel/arm64/scal.S                           |  46 +-
 kernel/arm64/sgemm_kernel_16x4.S              | 442 ++++++++--------
 kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S | 442 ++++++++--------
 kernel/arm64/sgemm_kernel_4x4.S               | 310 +++++------
 kernel/arm64/sgemm_kernel_8x8.S               | 482 +++++++++---------
 kernel/arm64/strmm_kernel_16x4.S              | 442 ++++++++--------
 kernel/arm64/strmm_kernel_4x4.S               | 260 +++++-----
 kernel/arm64/strmm_kernel_8x8.S               | 482 +++++++++---------
 kernel/arm64/swap.S                           |  42 +-
 kernel/arm64/zamax.S                          |  50 +-
 kernel/arm64/zasum.S                          |  40 +-
 kernel/arm64/zaxpy.S                          |  42 +-
 kernel/arm64/zdot.S                           |  40 +-
 kernel/arm64/zgemm_kernel_4x4.S               | 260 +++++-----
 kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S  | 260 +++++-----
 kernel/arm64/zgemv_n.S                        |  52 +-
 kernel/arm64/zgemv_t.S                        |  52 +-
 kernel/arm64/znrm2.S                          |  32 +-
 kernel/arm64/zrot.S                           |  40 +-
 kernel/arm64/zscal.S                          |  68 +--
 kernel/arm64/ztrmm_kernel_4x4.S               | 260 +++++-----
 50 files changed, 4469 insertions(+), 4469 deletions(-)
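The change is mechanical: a label such as

    amax_kernel_F4:

becomes

    .Lamax_kernel_F4:

and every branch to it is updated to match. On ELF targets, GNU as treats
labels beginning with ".L" as assembler-local: they are resolved at assembly
time and never emitted into the object file's symbol table, so perf no longer
treats them as function boundaries. A minimal standalone sketch of the effect
(hypothetical function and label names, not taken from this patch):

        .text
        .global demo_loop               // exported function symbol
        .type   demo_loop, %function
    demo_loop:
        mov     x1, #4
    .Ldemo_loop_iter:                   // ".L" prefix: local, absent from the symbol table
        subs    x1, x1, #1
        bne     .Ldemo_loop_iter        // branches to local labels work unchanged
        ret

Assembled with `as demo.s -o demo.o`, `nm demo.o` shows no symbol for the loop
label; with a plain `demo_loop_iter:` label it would appear as a local text
symbol, and perf would attribute the loop's samples to it instead of to
demo_loop.

diff --git a/kernel/arm64/amax.S b/kernel/arm64/amax.S
index c02321ae0..f535ddf27 100644
--- a/kernel/arm64/amax.S
+++ b/kernel/arm64/amax.S
@@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.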
PROLOGUE cmp N, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, #1 - bne amax_kernel_S_BEGIN + bne .Lamax_kernel_S_BEGIN -amax_kernel_F_BEGIN: +.Lamax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq amax_kernel_F1_INIT + beq .Lamax_kernel_F1_INIT INIT_F4 subs I, I, #1 - beq amax_kernel_F1 + beq .Lamax_kernel_F1 -amax_kernel_F4: +.Lamax_kernel_F4: KERNEL_F4 subs I, I, #1 - bne amax_kernel_F4 + bne .Lamax_kernel_F4 -amax_kernel_F1: +.Lamax_kernel_F1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_F10: +.Lamax_kernel_F10: KERNEL_F1 subs I, I, #1 - bne amax_kernel_F10 + bne .Lamax_kernel_F10 ret -amax_kernel_F1_INIT: +.Lamax_kernel_F1_INIT: INIT_F1 subs N, N, #1 - b amax_kernel_F1 + b .Lamax_kernel_F1 -amax_kernel_S_BEGIN: +.Lamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble amax_kernel_S1 + ble .Lamax_kernel_S1 -amax_kernel_S4: +.Lamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -223,25 +223,25 @@ amax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S4 + bne .Lamax_kernel_S4 -amax_kernel_S1: +.Lamax_kernel_S1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_S10: +.Lamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S10 + bne .Lamax_kernel_S10 -amax_kernel_L999: +.Lamax_kernel_L999: ret -amax_kernel_zero: +.Lamax_kernel_zero: fmov MAXF, REG0 ret diff --git a/kernel/arm64/asum.S b/kernel/arm64/asum.S index bee8927b1..e88eb07c2 100644 --- a/kernel/arm64/asum.S +++ b/kernel/arm64/asum.S @@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lasum_kernel_F1 -asum_kernel_F8: +.Lasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_F10: +.Lasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lasum_kernel_F10 -asum_kernel_L999: +.Lasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lasum_kernel_S1 -asum_kernel_S4: +.Lasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -175,19 +175,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lasum_kernel_S4 -asum_kernel_S1: +.Lasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_S10: +.Lasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lasum_kernel_S10 ret diff --git a/kernel/arm64/axpy.S b/kernel/arm64/axpy.S index 554902c09..809435110 100644 --- a/kernel/arm64/axpy.S +++ b/kernel/arm64/axpy.S @@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 fcmp DA, #0.0 - beq axpy_kernel_L999 + beq .Laxpy_kernel_L999 cmp INC_X, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN -axpy_kernel_F_BEGIN: +.Laxpy_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq axpy_kernel_F1 + beq .Laxpy_kernel_F1 -axpy_kernel_F8: +.Laxpy_kernel_F8: KERNEL_F8 subs I, I, #1 - bne axpy_kernel_F8 + bne .Laxpy_kernel_F8 -axpy_kernel_F1: +.Laxpy_kernel_F1: ands I, N, #7 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_F10: +.Laxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne axpy_kernel_F10 + bne .Laxpy_kernel_F10 mov w0, wzr ret -axpy_kernel_S_BEGIN: +.Laxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble axpy_kernel_S1 + ble .Laxpy_kernel_S1 -axpy_kernel_S4: +.Laxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -189,21 +189,21 @@ axpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S4 + bne .Laxpy_kernel_S4 -axpy_kernel_S1: +.Laxpy_kernel_S1: ands I, N, #3 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_S10: +.Laxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S10 + bne .Laxpy_kernel_S10 -axpy_kernel_L999: +.Laxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/casum.S b/kernel/arm64/casum.S index 8f09eecfa..7c82827a5 100644 --- a/kernel/arm64/casum.S +++ b/kernel/arm64/casum.S @@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov s1, SUMF cmp N, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lcasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lcasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lcasum_kernel_F1 -asum_kernel_F8: +.Lcasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lcasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lcasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_F10: +.Lcasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lcasum_kernel_F10 -asum_kernel_L999: +.Lcasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lcasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lcasum_kernel_S1 -asum_kernel_S4: +.Lcasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -151,19 +151,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lcasum_kernel_S4 -asum_kernel_S1: +.Lcasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_S10: +.Lcasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lcasum_kernel_S10 ret diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7f2ddea07..bbf0c7537 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array add ppA, temp, pA -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_40: +.Lcgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_42: +.Lcgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_42 + bgt .Lcgemm_kernel_L4_M4_42 -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble 
.Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M4_20 + bgt .Lcgemm_kernel_L2_M4_20 -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 -cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 
/******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN: -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 -cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M4_20 + bgt .Lcgemm_kernel_L1_M4_20 -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: 
SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 5d1462808..24e08a646 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left 
in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 
-cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 
-cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 367cd0217..29a68ff22 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #5 // origK / 32 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x8 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x16 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 @@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #31 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left 
in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 
-cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 
-cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 70eab96fb..b8c6bfcd4 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 cmp INC_X, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN cmp INC_Y, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN -copy_kernel_F_BEGIN: +.Lcopy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq copy_kernel_F1 + beq .Lcopy_kernel_F1 -copy_kernel_F4: +.Lcopy_kernel_F4: KERNEL_F4 subs I, I, #1 - bne copy_kernel_F4 + bne .Lcopy_kernel_F4 -copy_kernel_F1: +.Lcopy_kernel_F1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_F10: +.Lcopy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne copy_kernel_F10 + bne .Lcopy_kernel_F10 mov w0, wzr ret -copy_kernel_S_BEGIN: +.Lcopy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble copy_kernel_S1 + ble .Lcopy_kernel_S1 -copy_kernel_S4: +.Lcopy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -210,21 +210,21 @@ copy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S4 + bne .Lcopy_kernel_S4 -copy_kernel_S1: +.Lcopy_kernel_S1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_S10: +.Lcopy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S10 + bne .Lcopy_kernel_S10 -copy_kernel_L999: +.Lcopy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index 3de27257a..79d33e93c 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ctrmm_kernel_L2_BEGIN + ble .Lctrmm_kernel_L2_BEGIN /******************************************************************************/ -ctrmm_kernel_L4_BEGIN: +.Lctrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ctrmm_kernel_L4_M4_BEGIN: +.Lctrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ctrmm_kernel_L4_M2_BEGIN + ble .Lctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: +.Lctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt ctrmm_kernel_L4_M4_32
+ blt .Lctrmm_kernel_L4_M4_32
 KERNEL4x4_I // do one in the K
 KERNEL4x4_M2 // do another in the K
 subs counterL, counterL, #2
- ble ctrmm_kernel_L4_M4_22a
+ ble .Lctrmm_kernel_L4_M4_22a
 .align 5
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
 KERNEL4x4_M1
 KERNEL4x4_M2
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M4_22
+ bgt .Lctrmm_kernel_L4_M4_22
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
 KERNEL4x4_M1
 KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
+ b .Lctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_32:
+.Lctrmm_kernel_L4_M4_32:
 tst counterL, #1
- ble ctrmm_kernel_L4_M4_40
+ ble .Lctrmm_kernel_L4_M4_40
 KERNEL4x4_I
 KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
+ b .Lctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_40:
+.Lctrmm_kernel_L4_M4_40:
 INIT4x4
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
 ands counterL , tempK, #1
- ble ctrmm_kernel_L4_M4_100
+ ble .Lctrmm_kernel_L4_M4_100
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
 KERNEL4x4_SUB
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
 SAVE4x4
@@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
 subs counterI, counterI, #1
- bne ctrmm_kernel_L4_M4_20
+ bne .Lctrmm_kernel_L4_M4_20
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L4_M1_BEGIN
+ ble .Lctrmm_kernel_L4_M1_BEGIN
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
 INIT2x4
@@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L4_M2_40
+ ble .Lctrmm_kernel_L4_M2_40
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_22
+ bgt .Lctrmm_kernel_L4_M2_22
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M2_100
+ ble .Lctrmm_kernel_L4_M2_100
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_42
+ bgt .Lctrmm_kernel_L4_M2_42
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
 SAVE2x4
@@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
 INIT1x4
@@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L4_M1_40
+ ble .Lctrmm_kernel_L4_M1_40
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_22
+ bgt .Lctrmm_kernel_L4_M1_22
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M1_100
+ ble .Lctrmm_kernel_L4_M1_100
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_42
+ bgt .Lctrmm_kernel_L4_M1_42
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
 SAVE1x4
@@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
 add tempOffset, tempOffset, #1
 #endif
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
 #endif
 subs counterJ, counterJ , #1 // j--
- bgt ctrmm_kernel_L4_BEGIN
+ bgt .Lctrmm_kernel_L4_BEGIN
 /******************************************************************************/
-ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov counterJ , origN
 tst counterJ , #3
- ble ctrmm_kernel_L999 // error, N was less than 4?
+ ble .Lctrmm_kernel_L999 // error, N was less than 4?
 tst counterJ , #2
- ble ctrmm_kernel_L1_BEGIN
+ ble .Lctrmm_kernel_L1_BEGIN
 mov pCRow0, pC // pCRow0 = pC
@@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov pA, origPA // pA = A
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI,#0
- ble ctrmm_kernel_L2_M2_BEGIN
+ ble .Lctrmm_kernel_L2_M2_BEGIN
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
 INIT4x2
@@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble ctrmm_kernel_L2_M4_40
+ ble .Lctrmm_kernel_L2_M4_40
 .align 5
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_22
+ bgt .Lctrmm_kernel_L2_M4_22
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M4_100
+ ble .Lctrmm_kernel_L2_M4_100
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_42
+ bgt .Lctrmm_kernel_L2_M4_42
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
 SAVE4x2
@@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
 subs counterI, counterI, #1
- bgt ctrmm_kernel_L2_M4_20
+ bgt .Lctrmm_kernel_L2_M4_20
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M1_BEGIN
+ ble .Lctrmm_kernel_L2_M1_BEGIN
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
 INIT2x2
@@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble ctrmm_kernel_L2_M2_40
+ ble .Lctrmm_kernel_L2_M2_40
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_22
+ bgt .Lctrmm_kernel_L2_M2_22
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M2_100
+ ble .Lctrmm_kernel_L2_M2_100
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_42
+ bgt .Lctrmm_kernel_L2_M2_42
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
 SAVE2x2
@@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
 INIT1x2
@@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble ctrmm_kernel_L2_M1_40
+ ble .Lctrmm_kernel_L2_M1_40
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_22
+ bgt .Lctrmm_kernel_L2_M1_22
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M1_100
+ ble .Lctrmm_kernel_L2_M1_100
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_42
+ bgt .Lctrmm_kernel_L2_M1_42
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
 SAVE1x2
@@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
 add tempOffset, tempOffset, #1
 #endif
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
 #if !defined(LEFT)
 add tempOffset, tempOffset, #2
 #endif
@@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:
 /******************************************************************************/
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
 mov counterJ , origN
 tst counterJ , #1
- ble ctrmm_kernel_L999 // done
+ ble .Lctrmm_kernel_L999 // done
 mov pCRow0, pC // pCRow0 = C
@@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:
 mov pA, origPA // pA = A
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble ctrmm_kernel_L1_M2_BEGIN
+ ble .Lctrmm_kernel_L1_M2_BEGIN
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
 INIT4x1
@@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M4_40
+ ble .Lctrmm_kernel_L1_M4_40
 .align 5
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_22
+ bgt .Lctrmm_kernel_L1_M4_22
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M4_100
+ ble .Lctrmm_kernel_L1_M4_100
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_42
+ bgt .Lctrmm_kernel_L1_M4_42
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
 SAVE4x1
@@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
 subs counterI, counterI, #1
- bgt ctrmm_kernel_L1_M4_20
+ bgt .Lctrmm_kernel_L1_M4_20
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M1_BEGIN
+ ble .Lctrmm_kernel_L1_M1_BEGIN
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
 INIT2x1
@@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M2_40
+ ble .Lctrmm_kernel_L1_M2_40
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_22
+ bgt .Lctrmm_kernel_L1_M2_22
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M2_100
+ ble .Lctrmm_kernel_L1_M2_100
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_42
+ bgt .Lctrmm_kernel_L1_M2_42
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
 SAVE2x1
@@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
 INIT1x1
@@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M1_40
+ ble .Lctrmm_kernel_L1_M1_40
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_22
+ bgt .Lctrmm_kernel_L1_M1_22
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M1_100
+ ble .Lctrmm_kernel_L1_M1_100
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_42
+ bgt .Lctrmm_kernel_L1_M1_42
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
 SAVE1x1
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
index 680fb56c3..5c0827397 100644
--- a/kernel/arm64/ctrmm_kernel_8x4.S
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 mov counterJ, origN
 asr counterJ, counterJ, #2 // J = J / 4
 cmp counterJ, #0
- ble ctrmm_kernel_L2_BEGIN
+ ble .Lctrmm_kernel_L2_BEGIN
 /******************************************************************************/
-ctrmm_kernel_L4_BEGIN:
+.Lctrmm_kernel_L4_BEGIN:
 mov pCRow0, pC
 add pCRow1, pCRow0, LDC
 add pCRow2, pCRow1, LDC
@@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
 #endif
 mov pA, origPA // pA = start of A array
-ctrmm_kernel_L4_M8_BEGIN:
+.Lctrmm_kernel_L4_M8_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI, #0
- ble ctrmm_kernel_L4_M4_BEGIN
+ ble .Lctrmm_kernel_L4_M4_BEGIN
-ctrmm_kernel_L4_M8_20:
+.Lctrmm_kernel_L4_M8_20:
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 mov pB, origPB
@@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:
 asr counterL , tempK, #3
 cmp counterL , #2
- blt ctrmm_kernel_L4_M8_32
+ blt .Lctrmm_kernel_L4_M8_32
 KERNEL8x4_I
 KERNEL8x4_M2
@@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
 KERNEL8x4_M2
 subs counterL, counterL, #2 // subtract 2
- ble ctrmm_kernel_L4_M8_22a
+ ble .Lctrmm_kernel_L4_M8_22a
 .align 5
-ctrmm_kernel_L4_M8_22:
+.Lctrmm_kernel_L4_M8_22:
 KERNEL8x4_M1
 KERNEL8x4_M2
@@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
 KERNEL8x4_M2
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M8_22
+ bgt .Lctrmm_kernel_L4_M8_22
 .align 5
-ctrmm_kernel_L4_M8_22a:
+.Lctrmm_kernel_L4_M8_22a:
 KERNEL8x4_M1
 KERNEL8x4_M2
@@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
 KERNEL8x4_M1
 KERNEL8x4_E
- b ctrmm_kernel_L4_M8_44
+ b .Lctrmm_kernel_L4_M8_44
 .align 5
-ctrmm_kernel_L4_M8_32:
+.Lctrmm_kernel_L4_M8_32:
 tst counterL, #1
- ble ctrmm_kernel_L4_M8_40
+ ble .Lctrmm_kernel_L4_M8_40
 KERNEL8x4_I
 KERNEL8x4_M2
@@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
 KERNEL8x4_M1
 KERNEL8x4_E
- b ctrmm_kernel_L4_M8_44
+ b .Lctrmm_kernel_L4_M8_44
-ctrmm_kernel_L4_M8_40:
+.Lctrmm_kernel_L4_M8_40:
 INIT8x4
-ctrmm_kernel_L4_M8_44:
+.Lctrmm_kernel_L4_M8_44:
 ands counterL , tempK, #7
- ble ctrmm_kernel_L4_M8_100
+ ble .Lctrmm_kernel_L4_M8_100
 .align 5
-ctrmm_kernel_L4_M8_46:
+.Lctrmm_kernel_L4_M8_46:
 KERNEL8x4_SUB
 subs counterL, counterL, #1
- bne ctrmm_kernel_L4_M8_46
+ bne .Lctrmm_kernel_L4_M8_46
-ctrmm_kernel_L4_M8_100:
+.Lctrmm_kernel_L4_M8_100:
 SAVE8x4
@@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
 prfm PLDL1KEEP, [pA, #64]
 prfm PLDL1KEEP, [origPB]
-ctrmm_kernel_L4_M8_END:
+.Lctrmm_kernel_L4_M8_END:
 subs counterI, counterI, #1
- bne ctrmm_kernel_L4_M8_20
+ bne .Lctrmm_kernel_L4_M8_20
-ctrmm_kernel_L4_M4_BEGIN:
+.Lctrmm_kernel_L4_M4_BEGIN:
 mov counterI, origM
 tst counterI , #7
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
 tst counterI, #4
- ble ctrmm_kernel_L4_M2_BEGIN
+ ble .Lctrmm_kernel_L4_M2_BEGIN
-ctrmm_kernel_L4_M4_20:
+.Lctrmm_kernel_L4_M4_20:
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 mov pB, origPB
@@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:
 asr counterL , tempK, #1 // L = K / 2
 cmp counterL , #2 // is there at least 4 to do?
- blt ctrmm_kernel_L4_M4_32
+ blt .Lctrmm_kernel_L4_M4_32
 KERNEL4x4_I // do one in the K
 KERNEL4x4_M2 // do another in the K
 subs counterL, counterL, #2
- ble ctrmm_kernel_L4_M4_22a
+ ble .Lctrmm_kernel_L4_M4_22a
 .align 5
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
 KERNEL4x4_M1
 KERNEL4x4_M2
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M4_22
+ bgt .Lctrmm_kernel_L4_M4_22
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
 KERNEL4x4_M1
 KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_32:
+ b .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_32:
 tst counterL, #1
- ble ctrmm_kernel_L4_M4_40
+ ble .Lctrmm_kernel_L4_M4_40
 KERNEL4x4_I
 KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_40:
+ b .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_40:
 INIT4x4
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
 ands counterL , tempK, #1
- ble ctrmm_kernel_L4_M4_100
+ ble .Lctrmm_kernel_L4_M4_100
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
 KERNEL4x4_SUB
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
 SAVE4x4
@@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L4_M1_BEGIN
+ ble .Lctrmm_kernel_L4_M1_BEGIN
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
 INIT2x4
@@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L4_M2_40
+ ble .Lctrmm_kernel_L4_M2_40
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_22
+ bgt .Lctrmm_kernel_L4_M2_22
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M2_100
+ ble .Lctrmm_kernel_L4_M2_100
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_42
+ bgt .Lctrmm_kernel_L4_M2_42
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
 SAVE2x4
@@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
 INIT1x4
@@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L4_M1_40
+ ble .Lctrmm_kernel_L4_M1_40
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_22
+ bgt .Lctrmm_kernel_L4_M1_22
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M1_100
+ ble .Lctrmm_kernel_L4_M1_100
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_42
+ bgt .Lctrmm_kernel_L4_M1_42
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
 SAVE1x4
@@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
 add tempOffset, tempOffset, #1
 #endif
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
 #endif
 subs counterJ, counterJ , #1 // j--
- bgt ctrmm_kernel_L4_BEGIN
+ bgt .Lctrmm_kernel_L4_BEGIN
 /******************************************************************************/
-ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov counterJ , origN
 tst counterJ , #3
- ble ctrmm_kernel_L999 // error, N was less than 4?
+ ble .Lctrmm_kernel_L999 // error, N was less than 4?
 tst counterJ , #2
- ble ctrmm_kernel_L1_BEGIN
+ ble .Lctrmm_kernel_L1_BEGIN
 mov pCRow0, pC // pCRow0 = pC
@@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
 #endif
 mov pA, origPA // pA = A
-ctrmm_kernel_L2_M8_BEGIN:
+.Lctrmm_kernel_L2_M8_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI, #0
- ble ctrmm_kernel_L2_M4_BEGIN
+ ble .Lctrmm_kernel_L2_M4_BEGIN
-ctrmm_kernel_L2_M8_20:
+.Lctrmm_kernel_L2_M8_20:
 INIT8x2
@@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble ctrmm_kernel_L2_M8_40
+ ble .Lctrmm_kernel_L2_M8_40
 .align 5
-ctrmm_kernel_L2_M8_22:
+.Lctrmm_kernel_L2_M8_22:
 KERNEL8x2_SUB
 KERNEL8x2_SUB
 KERNEL8x2_SUB
@@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
 KERNEL8x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M8_22
+ bgt .Lctrmm_kernel_L2_M8_22
-ctrmm_kernel_L2_M8_40:
+.Lctrmm_kernel_L2_M8_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M8_100
+ ble .Lctrmm_kernel_L2_M8_100
-ctrmm_kernel_L2_M8_42:
+.Lctrmm_kernel_L2_M8_42:
 KERNEL8x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M8_42
+ bgt .Lctrmm_kernel_L2_M8_42
-ctrmm_kernel_L2_M8_100:
+.Lctrmm_kernel_L2_M8_100:
 SAVE8x2
@@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
 add tempOffset, tempOffset, #8
 #endif
-ctrmm_kernel_L2_M8_END:
+.Lctrmm_kernel_L2_M8_END:
 subs counterI, counterI, #1
- bgt ctrmm_kernel_L2_M8_20
+ bgt .Lctrmm_kernel_L2_M8_20
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
 mov counterI, origM
 tst counterI , #7
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
 tst counterI, #4 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M2_BEGIN
+ ble .Lctrmm_kernel_L2_M2_BEGIN
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
 INIT4x2
@@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble ctrmm_kernel_L2_M4_40
+ ble .Lctrmm_kernel_L2_M4_40
 .align 5
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_22
+ bgt .Lctrmm_kernel_L2_M4_22
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M4_100
+ ble .Lctrmm_kernel_L2_M4_100
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_42
+ bgt .Lctrmm_kernel_L2_M4_42
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
 SAVE4x2
@@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M1_BEGIN
+ ble .Lctrmm_kernel_L2_M1_BEGIN
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
 INIT2x2
@@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble ctrmm_kernel_L2_M2_40
+ ble .Lctrmm_kernel_L2_M2_40
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_22
+ bgt .Lctrmm_kernel_L2_M2_22
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M2_100
+ ble .Lctrmm_kernel_L2_M2_100
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_42
+ bgt .Lctrmm_kernel_L2_M2_42
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
 SAVE2x2
@@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
 INIT1x2
@@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble ctrmm_kernel_L2_M1_40
+ ble .Lctrmm_kernel_L2_M1_40
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_22
+ bgt .Lctrmm_kernel_L2_M1_22
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M1_100
+ ble .Lctrmm_kernel_L2_M1_100
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_42
+ bgt .Lctrmm_kernel_L2_M1_42
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
 SAVE1x2
@@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
 add tempOffset, tempOffset, #1
 #endif
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
 #if !defined(LEFT)
 add tempOffset, tempOffset, #2
 #endif
@@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:
 /******************************************************************************/
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
 mov counterJ , origN
 tst counterJ , #1
- ble ctrmm_kernel_L999 // done
+ ble .Lctrmm_kernel_L999 // done
 mov pCRow0, pC // pCRow0 = C
 add pC , pC , LDC // Update pC to point to next
@@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
 #endif
 mov pA, origPA // pA = A
-ctrmm_kernel_L1_M8_BEGIN:
+.Lctrmm_kernel_L1_M8_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI, #0
- ble ctrmm_kernel_L1_M4_BEGIN
+ ble .Lctrmm_kernel_L1_M4_BEGIN
-ctrmm_kernel_L1_M8_20:
+.Lctrmm_kernel_L1_M8_20:
 INIT8x1
@@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M8_40
+ ble .Lctrmm_kernel_L1_M8_40
 .align 5
-ctrmm_kernel_L1_M8_22:
+.Lctrmm_kernel_L1_M8_22:
 KERNEL8x1_SUB
 KERNEL8x1_SUB
 KERNEL8x1_SUB
@@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
 KERNEL8x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M8_22
+ bgt .Lctrmm_kernel_L1_M8_22
-ctrmm_kernel_L1_M8_40:
+.Lctrmm_kernel_L1_M8_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M8_100
+ ble .Lctrmm_kernel_L1_M8_100
-ctrmm_kernel_L1_M8_42:
+.Lctrmm_kernel_L1_M8_42:
 KERNEL8x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M8_42
+ bgt .Lctrmm_kernel_L1_M8_42
-ctrmm_kernel_L1_M8_100:
+.Lctrmm_kernel_L1_M8_100:
 SAVE8x1
@@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
 add tempOffset, tempOffset, #8
 #endif
-ctrmm_kernel_L1_M8_END:
+.Lctrmm_kernel_L1_M8_END:
 subs counterI, counterI, #1
- bgt ctrmm_kernel_L1_M8_20
+ bgt .Lctrmm_kernel_L1_M8_20
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
 mov counterI, origM
 tst counterI , #7
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
 tst counterI, #4 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M2_BEGIN
+ ble .Lctrmm_kernel_L1_M2_BEGIN
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
 INIT4x1
@@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M4_40
+ ble .Lctrmm_kernel_L1_M4_40
 .align 5
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_22
+ bgt .Lctrmm_kernel_L1_M4_22
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M4_100
+ ble .Lctrmm_kernel_L1_M4_100
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_42
+ bgt .Lctrmm_kernel_L1_M4_42
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
 SAVE4x1
@@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
 add tempOffset, tempOffset, #4
 #endif
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
 tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M1_BEGIN
+ ble .Lctrmm_kernel_L1_M1_BEGIN
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
 INIT2x1
@@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M2_40
+ ble .Lctrmm_kernel_L1_M2_40
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_22
+ bgt .Lctrmm_kernel_L1_M2_22
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M2_100
+ ble .Lctrmm_kernel_L1_M2_100
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_42
+ bgt .Lctrmm_kernel_L1_M2_42
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
 SAVE2x1
@@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
 add tempOffset, tempOffset, #2
 #endif
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
 INIT1x1
@@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble ctrmm_kernel_L1_M1_40
+ ble .Lctrmm_kernel_L1_M1_40
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_22
+ bgt .Lctrmm_kernel_L1_M1_22
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
 ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M1_100
+ ble .Lctrmm_kernel_L1_M1_100
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_42
+ bgt .Lctrmm_kernel_L1_M1_42
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
 SAVE1x1
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S
index 5eb2ec0c3..b8d0af5c2 100644
--- a/kernel/arm64/daxpy_thunderx2t99.S
+++ b/kernel/arm64/daxpy_thunderx2t99.S
@@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE
 cmp N, xzr
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
 fcmp DA, #0.0
- beq axpy_kernel_L999
+ beq .Ldaxpy_kernel_L999
 cmp INC_X, #1
- bne axpy_kernel_S_BEGIN
+ bne .Ldaxpy_kernel_S_BEGIN
 cmp INC_Y, #1
- bne axpy_kernel_S_BEGIN
+ bne .Ldaxpy_kernel_S_BEGIN
-axpy_kernel_F_BEGIN:
+.Ldaxpy_kernel_F_BEGIN:
 asr I, N, #5
 cmp I, xzr
- beq axpy_kernel_F1
+ beq .Ldaxpy_kernel_F1
 .align 5
-axpy_kernel_F32:
+.Ldaxpy_kernel_F32:
 KERNEL_F32
 subs I, I, #1
- bne axpy_kernel_F32
+ bne .Ldaxpy_kernel_F32
-axpy_kernel_F1:
+.Ldaxpy_kernel_F1:
 ands I, N, #31
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
-axpy_kernel_F10:
+.Ldaxpy_kernel_F10:
 KERNEL_F1
 subs I, I, #1
- bne axpy_kernel_F10
+ bne .Ldaxpy_kernel_F10
- b axpy_kernel_L999
+ b .Ldaxpy_kernel_L999
-axpy_kernel_S_BEGIN:
+.Ldaxpy_kernel_S_BEGIN:
 INIT_S
 asr I, N, #2
 cmp I, xzr
- ble axpy_kernel_S1
+ ble .Ldaxpy_kernel_S1
-axpy_kernel_S4:
+.Ldaxpy_kernel_S4:
 KERNEL_S1
 KERNEL_S1
@@ -176,21 +176,21 @@ axpy_kernel_S4:
 KERNEL_S1
 subs I, I, #1
- bne axpy_kernel_S4
+ bne .Ldaxpy_kernel_S4
-axpy_kernel_S1:
+.Ldaxpy_kernel_S1:
 ands I, N, #3
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
-axpy_kernel_S10:
+.Ldaxpy_kernel_S10:
 KERNEL_S1
 subs I, I, #1
- bne axpy_kernel_S10
+ bne .Ldaxpy_kernel_S10
-axpy_kernel_L999:
+.Ldaxpy_kernel_L999:
 mov w0, wzr
 ret
diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S
index 44b0f7ff2..349167062 100644
--- a/kernel/arm64/dgemm_kernel_4x4.S
+++ b/kernel/arm64/dgemm_kernel_4x4.S
@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 mov counterJ, origN
 asr counterJ, counterJ, #2 // J = J / 4
 cmp counterJ, #0
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
 mov pCRow0, pC
 add pCRow1, pCRow0, LDC
 add pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
 //------------------------------------------------------------------------------
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI, #0
- ble dgemm_kernel_L4_M4_BEGIN
+ ble .Ldgemm_kernel_L4_M4_BEGIN
 .align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
 mov pB, origPB
 asr counterL , origK, #2 // L = K / 4
 cmp counterL , #2
- blt dgemm_kernel_L4_M8_32
+ blt .Ldgemm_kernel_L4_M8_32
 KERNEL8x4_I
 KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
 KERNEL8x4_M2
 subs counterL, counterL, #2 // subtract 2
- ble dgemm_kernel_L4_M8_22a
+ ble .Ldgemm_kernel_L4_M8_22a
 .align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
 KERNEL8x4_M1
 KERNEL8x4_M2
 KERNEL8x4_M1
 KERNEL8x4_M2
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M8_22
+ bgt .Ldgemm_kernel_L4_M8_22
 .align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
 KERNEL8x4_M1
 KERNEL8x4_M2
 KERNEL8x4_M1
 KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
 .align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
 tst counterL, #1
- ble dgemm_kernel_L4_M8_40
+ ble .Ldgemm_kernel_L4_M8_40
 KERNEL8x4_I
 KERNEL8x4_M2
 KERNEL8x4_M1
 KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
 INIT8x4
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
 ands counterL , origK, #3
- ble dgemm_kernel_L4_M8_100
+ ble .Ldgemm_kernel_L4_M8_100
 .align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
 KERNEL8x4_SUB
 subs counterL, counterL, #1
- bne dgemm_kernel_L4_M8_46
+ bne .Ldgemm_kernel_L4_M8_46
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
 lsl temp, origK, #5
 prfm PLDL1KEEP, [pA, temp]
 prfm PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
 SAVE8x4
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
 lsl temp, origK, #5 // k * 4 * 8
 add pA, pA, temp
 add ppA, ppA, temp
 subs counterI, counterI, #1
- bne dgemm_kernel_L4_M8_20
+ bne .Ldgemm_kernel_L4_M8_20
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
 mov counterI, origM
 tst counterI , #7
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
 tst counterI, #4
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 INIT4x4
 mov pB, origPB
 asr counterL, origK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 KERNEL4x4_SUB
 KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
 KERNEL4x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
 KERNEL4x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_42
+ bgt .Ldgemm_kernel_L4_M4_42
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 INIT2x4
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 INIT1x4
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 4 * 8
 subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L4_BEGIN
+ bgt .Ldgemm_kernel_L4_BEGIN
 /******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov counterJ , origN
 tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
 tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
 mov pCRow0, pC // pCRow0 = pC
@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI,#0
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 INIT4x2
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
 .align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M4_20
+ bgt .Ldgemm_kernel_L2_M4_20
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 INIT2x2
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 INIT1x2
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_42
+ bgt .Ldgemm_kernel_L2_M1_42
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
 SAVE1x2
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
 /******************************************************************************/
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
 mov counterJ , origN
 tst counterJ , #1
- ble dgemm_kernel_L999 // done
+ ble .Ldgemm_kernel_L999 // done
 mov pCRow0, pC // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble dgemm_kernel_L1_M2_BEGIN
+ ble .Ldgemm_kernel_L1_M2_BEGIN
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
 INIT4x1
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L1_M4_40
+ ble .Ldgemm_kernel_L1_M4_40
 .align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_22
+ bgt .Ldgemm_kernel_L1_M4_22
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M4_100
+ ble .Ldgemm_kernel_L1_M4_100
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
 KERNEL4x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_42
+ bgt .Ldgemm_kernel_L1_M4_42
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
 SAVE4x1
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
 subs counterI, counterI, #1
- bgt dgemm_kernel_L1_M4_20
+ bgt .Ldgemm_kernel_L1_M4_20
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L1_M1_BEGIN
+ ble .Ldgemm_kernel_L1_M1_BEGIN
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
 INIT2x1
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L1_M2_40
+ ble .Ldgemm_kernel_L1_M2_40
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_22
+ bgt .Ldgemm_kernel_L1_M2_22
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M2_100
+ ble .Ldgemm_kernel_L1_M2_100
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
 KERNEL2x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_42
+ bgt .Ldgemm_kernel_L1_M2_42
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
 SAVE2x1
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
 INIT1x1
 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L1_M1_40
+ ble .Ldgemm_kernel_L1_M1_40
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_22
+ bgt .Ldgemm_kernel_L1_M1_22
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M1_100
+ ble .Ldgemm_kernel_L1_M1_100
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
 KERNEL1x1_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_42
+ bgt .Ldgemm_kernel_L1_M1_42
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
 SAVE1x1
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S
index b04dbb5d5..ced26b49c 100644
--- a/kernel/arm64/dgemm_kernel_4x8.S
+++ b/kernel/arm64/dgemm_kernel_4x8.S
@@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 mov counterJ, origN
 asr counterJ, counterJ, #3 // J = J / 8
 cmp counterJ, #0
- ble dgemm_kernel_L4_BEGIN
+ ble .Ldgemm_kernel_L4_BEGIN
 /******************************************************************************/
-dgemm_kernel_L8_BEGIN:
+.Ldgemm_kernel_L8_BEGIN:
 mov pCRow0, pC // pCRow0 = C
 add pC, pC, LDC, lsl #3
 mov pA, origPA // pA = start of A array
-dgemm_kernel_L8_M4_BEGIN:
+.Ldgemm_kernel_L8_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble dgemm_kernel_L8_M2_BEGIN
+ ble .Ldgemm_kernel_L8_M2_BEGIN
-dgemm_kernel_L8_M4_20:
+.Ldgemm_kernel_L8_M4_20:
 mov pB, origPB
 asr counterL , origK, #1 // L = K / 2
 cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L8_M4_32
+ blt .Ldgemm_kernel_L8_M4_32
 KERNEL4x8_I // do one in the K
 KERNEL4x8_M2 // do another in the K
 subs counterL, counterL, #2
- ble dgemm_kernel_L8_M4_22a
+ ble .Ldgemm_kernel_L8_M4_22a
 .align 5
-dgemm_kernel_L8_M4_22:
+.Ldgemm_kernel_L8_M4_22:
 KERNEL4x8_M1
 KERNEL4x8_M2
 subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M4_22
+ bgt .Ldgemm_kernel_L8_M4_22
-dgemm_kernel_L8_M4_22a:
+.Ldgemm_kernel_L8_M4_22a:
 KERNEL4x8_M1
 KERNEL4x8_E
- b dgemm_kernel_L8_M4_44
+ b .Ldgemm_kernel_L8_M4_44
-dgemm_kernel_L8_M4_32:
+.Ldgemm_kernel_L8_M4_32:
 tst counterL, #1
- ble dgemm_kernel_L8_M4_40
+ ble .Ldgemm_kernel_L8_M4_40
 KERNEL4x8_I
 KERNEL4x8_E
- b dgemm_kernel_L8_M4_44
+ b .Ldgemm_kernel_L8_M4_44
-dgemm_kernel_L8_M4_40:
+.Ldgemm_kernel_L8_M4_40:
 INIT4x8
-dgemm_kernel_L8_M4_44:
+.Ldgemm_kernel_L8_M4_44:
 ands counterL , origK, #1
- ble dgemm_kernel_L8_M4_100
+ ble .Ldgemm_kernel_L8_M4_100
-dgemm_kernel_L8_M4_46:
+.Ldgemm_kernel_L8_M4_46:
 KERNEL4x8_SUB
-dgemm_kernel_L8_M4_100:
+.Ldgemm_kernel_L8_M4_100:
 SAVE4x8
-dgemm_kernel_L8_M4_END:
+.Ldgemm_kernel_L8_M4_END:
 subs counterI, counterI, #1
- bne dgemm_kernel_L8_M4_20
+ bne .Ldgemm_kernel_L8_M4_20
-dgemm_kernel_L8_M2_BEGIN:
+.Ldgemm_kernel_L8_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L8_END
+ ble .Ldgemm_kernel_L8_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L8_M1_BEGIN
+ ble .Ldgemm_kernel_L8_M1_BEGIN
-dgemm_kernel_L8_M2_20:
+.Ldgemm_kernel_L8_M2_20:
 INIT2x8
@@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L8_M2_40
+ ble .Ldgemm_kernel_L8_M2_40
-dgemm_kernel_L8_M2_22:
+.Ldgemm_kernel_L8_M2_22:
 KERNEL2x8_SUB
 KERNEL2x8_SUB
@@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
 KERNEL2x8_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M2_22
+ bgt .Ldgemm_kernel_L8_M2_22
-dgemm_kernel_L8_M2_40:
+.Ldgemm_kernel_L8_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L8_M2_100
+ ble .Ldgemm_kernel_L8_M2_100
-dgemm_kernel_L8_M2_42:
+.Ldgemm_kernel_L8_M2_42:
 KERNEL2x8_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M2_42
+ bgt .Ldgemm_kernel_L8_M2_42
-dgemm_kernel_L8_M2_100:
+.Ldgemm_kernel_L8_M2_100:
 SAVE2x8
-dgemm_kernel_L8_M2_END:
+.Ldgemm_kernel_L8_M2_END:
-dgemm_kernel_L8_M1_BEGIN:
+.Ldgemm_kernel_L8_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L8_END
+ ble .Ldgemm_kernel_L8_END
-dgemm_kernel_L8_M1_20:
+.Ldgemm_kernel_L8_M1_20:
 INIT1x8
@@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L8_M1_40
+ ble .Ldgemm_kernel_L8_M1_40
-dgemm_kernel_L8_M1_22:
+.Ldgemm_kernel_L8_M1_22:
 KERNEL1x8_SUB
 KERNEL1x8_SUB
 KERNEL1x8_SUB
@@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
 KERNEL1x8_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M1_22
+ bgt .Ldgemm_kernel_L8_M1_22
-dgemm_kernel_L8_M1_40:
+.Ldgemm_kernel_L8_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L8_M1_100
+ ble .Ldgemm_kernel_L8_M1_100
-dgemm_kernel_L8_M1_42:
+.Ldgemm_kernel_L8_M1_42:
 KERNEL1x8_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M1_42
+ bgt .Ldgemm_kernel_L8_M1_42
-dgemm_kernel_L8_M1_100:
+.Ldgemm_kernel_L8_M1_100:
 SAVE1x8
-dgemm_kernel_L8_END:
+.Ldgemm_kernel_L8_END:
 lsl temp, origK, #6
 add origPB, origPB, temp // B = B + K * 8 * 8
 subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L8_BEGIN
+ bgt .Ldgemm_kernel_L8_BEGIN
 /******************************************************************************/
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
 mov counterJ , origN
 tst counterJ , #7
- ble dgemm_kernel_L999
+ ble .Ldgemm_kernel_L999
 tst counterJ , #4
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
 mov pCRow0, pC // pCRow0 = C
 add pC, pC, LDC, lsl #2
 mov pA, origPA // pA = start of A array
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 mov pB, origPB
 asr counterL , origK, #1 // L = K / 2
 cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L4_M4_32
+ blt .Ldgemm_kernel_L4_M4_32
 KERNEL4x4_I // do one in the K
 KERNEL4x4_M2 // do another in the K
 subs counterL, counterL, #2
- ble dgemm_kernel_L4_M4_22a
+ ble .Ldgemm_kernel_L4_M4_22a
 .align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 KERNEL4x4_M1
 KERNEL4x4_M2
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_22a:
+.Ldgemm_kernel_L4_M4_22a:
 KERNEL4x4_M1
 KERNEL4x4_E
- b dgemm_kernel_L4_M4_44
+ b .Ldgemm_kernel_L4_M4_44
-dgemm_kernel_L4_M4_32:
+.Ldgemm_kernel_L4_M4_32:
 tst counterL, #1
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
 KERNEL4x4_I
 KERNEL4x4_E
- b dgemm_kernel_L4_M4_44
+ b .Ldgemm_kernel_L4_M4_44
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 INIT4x4
-dgemm_kernel_L4_M4_44:
+.Ldgemm_kernel_L4_M4_44:
 ands counterL , origK, #1
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_46:
+.Ldgemm_kernel_L4_M4_46:
 KERNEL4x4_SUB
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
 subs counterI, counterI, #1
- bne dgemm_kernel_L4_M4_20
+ bne .Ldgemm_kernel_L4_M4_20
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 INIT2x4
@@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 KERNEL2x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 INIT1x4
@@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 KERNEL1x4_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 4 * 8
 /******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov counterJ , origN
 tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
 tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
 mov pCRow0, pC // pCRow0 = pC
@@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
 mov pA, origPA // pA = A
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI,#0
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 INIT4x2
@@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
 .align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 KERNEL4x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M4_20
+ bgt .Ldgemm_kernel_L2_M4_20
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 mov counterI, origM
 tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
 tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 INIT2x2
@@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 INIT1x2
@@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M4_20 + bgt .Ldgemm_kernel_L1_M4_20 -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + 
bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index 3fd74fc3b..af3aa0217 100644 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1019,10 +1019,10 
@@ dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB @@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN 
/******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB @@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = 
counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 
-dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB @@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] @@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, 
counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 86865d825..598db6e0c 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #7 // L = K / 128 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x1 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x64 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 @@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #127 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] @@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100: SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 
-dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB @@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst 
counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB @@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22: 
KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 -dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + 
ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB @@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] @@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, 
#(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_ncopy_4.S b/kernel/arm64/dgemm_ncopy_4.S index c98a73277..29d274d93 100644 --- a/kernel/arm64/dgemm_ncopy_4.S +++ b/kernel/arm64/dgemm_ncopy_4.S @@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: asr J, N, #2 // J = N / 4 cmp J, #0 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN .align 5 -dgemm_ncopy_L4_M4_BEGIN: +.Ldgemm_ncopy_L4_M4_BEGIN: mov A01, A00 add A02, A01, LDA @@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN: asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L4_M4_40 + ble .Ldgemm_ncopy_L4_M4_40 .align 5 -dgemm_ncopy_L4_M4_20: +.Ldgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_20 + bne .Ldgemm_ncopy_L4_M4_20 -dgemm_ncopy_L4_M4_40: +.Ldgemm_ncopy_L4_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L4_M4_END + ble .Ldgemm_ncopy_L4_M4_END .align 5 -dgemm_ncopy_L4_M4_60: +.Ldgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_60 + bne .Ldgemm_ncopy_L4_M4_60 -dgemm_ncopy_L4_M4_END: +.Ldgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L4_M4_BEGIN + bne .Ldgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M4_BEGIN: +.Ldgemm_ncopy_L2_M4_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L2_M4_40 + ble .Ldgemm_ncopy_L2_M4_40 .align 5 -dgemm_ncopy_L2_M4_20: +.Ldgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_20 + bne .Ldgemm_ncopy_L2_M4_20 -dgemm_ncopy_L2_M4_40: +.Ldgemm_ncopy_L2_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L2_M4_END + ble .Ldgemm_ncopy_L2_M4_END .align 5 -dgemm_ncopy_L2_M4_60: +.Ldgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_60 + bne .Ldgemm_ncopy_L2_M4_60 -dgemm_ncopy_L2_M4_END: +.Ldgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M4_BEGIN: +.Ldgemm_ncopy_L1_M4_BEGIN: mov A01, A00 asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L1_M4_40 + ble .Ldgemm_ncopy_L1_M4_40 .align 5 -dgemm_ncopy_L1_M4_20: +.Ldgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_20 + bne .Ldgemm_ncopy_L1_M4_20 -dgemm_ncopy_L1_M4_40: +.Ldgemm_ncopy_L1_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L1_M4_END + ble .Ldgemm_ncopy_L1_M4_END .align 5 -dgemm_ncopy_L1_M4_60: +.Ldgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_60 + bne .Ldgemm_ncopy_L1_M4_60 -dgemm_ncopy_L1_M4_END: +.Ldgemm_ncopy_L1_M4_END: -dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_ncopy_8.S b/kernel/arm64/dgemm_ncopy_8.S index 1f237b42c..366424830 100644 --- a/kernel/arm64/dgemm_ncopy_8.S +++ b/kernel/arm64/dgemm_ncopy_8.S @@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L8_BEGIN: +.Ldgemm_ncopy_L8_BEGIN: asr J, N, #3 // J = N / 8 cmp J, #0 - ble dgemm_ncopy_L4_BEGIN + ble .Ldgemm_ncopy_L4_BEGIN -dgemm_ncopy_L8_M8_BEGIN: +.Ldgemm_ncopy_L8_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L8_M8_40 + ble .Ldgemm_ncopy_L8_M8_40 -dgemm_ncopy_L8_M8_20: +.Ldgemm_ncopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_20 + bne .Ldgemm_ncopy_L8_M8_20 -dgemm_ncopy_L8_M8_40: +.Ldgemm_ncopy_L8_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L8_M8_END + ble .Ldgemm_ncopy_L8_M8_END -dgemm_ncopy_L8_M8_60: +.Ldgemm_ncopy_L8_M8_60: COPY1x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_60 + bne .Ldgemm_ncopy_L8_M8_60 -dgemm_ncopy_L8_M8_END: +.Ldgemm_ncopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L8_M8_BEGIN + bne .Ldgemm_ncopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: tst N, #7 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #4 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN -dgemm_ncopy_L4_M8_BEGIN: +.Ldgemm_ncopy_L4_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L4_M8_40 + ble .Ldgemm_ncopy_L4_M8_40 -dgemm_ncopy_L4_M8_20: +.Ldgemm_ncopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_20 + bne .Ldgemm_ncopy_L4_M8_20 -dgemm_ncopy_L4_M8_40: +.Ldgemm_ncopy_L4_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L4_M8_END + ble .Ldgemm_ncopy_L4_M8_END -dgemm_ncopy_L4_M8_60: +.Ldgemm_ncopy_L4_M8_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_60 + bne .Ldgemm_ncopy_L4_M8_60 -dgemm_ncopy_L4_M8_END: +.Ldgemm_ncopy_L4_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M8_BEGIN: +.Ldgemm_ncopy_L2_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L2_M8_40 + ble .Ldgemm_ncopy_L2_M8_40 -dgemm_ncopy_L2_M8_20: +.Ldgemm_ncopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_20 + bne .Ldgemm_ncopy_L2_M8_20 -dgemm_ncopy_L2_M8_40: +.Ldgemm_ncopy_L2_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L2_M8_END + ble .Ldgemm_ncopy_L2_M8_END -dgemm_ncopy_L2_M8_60: +.Ldgemm_ncopy_L2_M8_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_60 + bne .Ldgemm_ncopy_L2_M8_60 -dgemm_ncopy_L2_M8_END: +.Ldgemm_ncopy_L2_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M8_BEGIN: +.Ldgemm_ncopy_L1_M8_BEGIN: mov A01, A00 asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L1_M8_40 + ble .Ldgemm_ncopy_L1_M8_40 -dgemm_ncopy_L1_M8_20: +.Ldgemm_ncopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_20 + bne .Ldgemm_ncopy_L1_M8_20 -dgemm_ncopy_L1_M8_40: +.Ldgemm_ncopy_L1_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L1_M8_END + ble .Ldgemm_ncopy_L1_M8_END -dgemm_ncopy_L1_M8_60: +.Ldgemm_ncopy_L1_M8_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_60 + bne .Ldgemm_ncopy_L1_M8_60 -dgemm_ncopy_L1_M8_END: +.Ldgemm_ncopy_L1_M8_END: 
-dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_tcopy_4.S b/kernel/arm64/dgemm_tcopy_4.S index 5b2ed43f1..7c9135287 100644 --- a/kernel/arm64/dgemm_tcopy_4.S +++ b/kernel/arm64/dgemm_tcopy_4.S @@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl M4, M, #5 // M4 = M * 4 * SIZE -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: asr J, M, #2 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN .align 5 -dgemm_tcopy_L4_M4_BEGIN: +.Ldgemm_tcopy_L4_M4_BEGIN: mov A01, A add A02, A01, LDA @@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L4_M4_40 + ble .Ldgemm_tcopy_L4_M4_40 .align 5 -dgemm_tcopy_L4_M4_20: +.Ldgemm_tcopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_tcopy_L4_M4_20 + bne .Ldgemm_tcopy_L4_M4_20 -dgemm_tcopy_L4_M4_40: +.Ldgemm_tcopy_L4_M4_40: tst N , #2 - ble dgemm_tcopy_L4_M4_60 + ble .Ldgemm_tcopy_L4_M4_60 COPY2x4 -dgemm_tcopy_L4_M4_60: +.Ldgemm_tcopy_L4_M4_60: tst N, #1 - ble dgemm_tcopy_L4_M4_END + ble .Ldgemm_tcopy_L4_M4_END COPY1x4 -dgemm_tcopy_L4_M4_END: +.Ldgemm_tcopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L4_M4_BEGIN + bne .Ldgemm_tcopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M4_BEGIN: +.Ldgemm_tcopy_L2_M4_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L2_M4_40 + ble .Ldgemm_tcopy_L2_M4_40 .align 5 -dgemm_tcopy_L2_M4_20: +.Ldgemm_tcopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_tcopy_L2_M4_20 + bne .Ldgemm_tcopy_L2_M4_20 -dgemm_tcopy_L2_M4_40: +.Ldgemm_tcopy_L2_M4_40: tst N , #2 - ble dgemm_tcopy_L2_M4_60 + ble .Ldgemm_tcopy_L2_M4_60 COPY2x2 -dgemm_tcopy_L2_M4_60: +.Ldgemm_tcopy_L2_M4_60: tst N , #1 - ble dgemm_tcopy_L2_M4_END + ble .Ldgemm_tcopy_L2_M4_END COPY1x2 -dgemm_tcopy_L2_M4_END: +.Ldgemm_tcopy_L2_M4_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M4_BEGIN: +.Ldgemm_tcopy_L1_M4_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #2 // I = M / 4 cmp I, #0 - ble dgemm_tcopy_L1_M4_40 + ble .Ldgemm_tcopy_L1_M4_40 .align 5 -dgemm_tcopy_L1_M4_20: +.Ldgemm_tcopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_tcopy_L1_M4_20 + bne .Ldgemm_tcopy_L1_M4_20 -dgemm_tcopy_L1_M4_40: +.Ldgemm_tcopy_L1_M4_40: tst N , #2 - ble dgemm_tcopy_L1_M4_60 + ble .Ldgemm_tcopy_L1_M4_60 COPY2x1 -dgemm_tcopy_L1_M4_60: +.Ldgemm_tcopy_L1_M4_60: tst N , #1 - ble dgemm_tcopy_L1_M4_END + ble .Ldgemm_tcopy_L1_M4_END COPY1x1 -dgemm_tcopy_L1_M4_END: +.Ldgemm_tcopy_L1_M4_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 1c57e30e0..9ab51ff57 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lsl M8, M, #6 // M8 = M * 8 * SIZE -dgemm_tcopy_L8_BEGIN: +.Ldgemm_tcopy_L8_BEGIN: asr J, M, #3 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L4_BEGIN + ble .Ldgemm_tcopy_L4_BEGIN .align 5 -dgemm_tcopy_L8_M8_BEGIN: +.Ldgemm_tcopy_L8_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L8_M8_40 + ble .Ldgemm_tcopy_L8_M8_40 .align 5 -dgemm_tcopy_L8_M8_20: +.Ldgemm_tcopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_tcopy_L8_M8_20 + bne .Ldgemm_tcopy_L8_M8_20 -dgemm_tcopy_L8_M8_40: +.Ldgemm_tcopy_L8_M8_40: tst N , #4 - ble dgemm_tcopy_L8_M8_60 + ble .Ldgemm_tcopy_L8_M8_60 COPY4x8 -dgemm_tcopy_L8_M8_60: +.Ldgemm_tcopy_L8_M8_60: tst N , #2 - ble dgemm_tcopy_L8_M8_80 + ble .Ldgemm_tcopy_L8_M8_80 COPY2x8 -dgemm_tcopy_L8_M8_80: +.Ldgemm_tcopy_L8_M8_80: tst N, #1 - ble dgemm_tcopy_L8_M8_END + ble .Ldgemm_tcopy_L8_M8_END COPY1x8 -dgemm_tcopy_L8_M8_END: +.Ldgemm_tcopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L8_M8_BEGIN + bne .Ldgemm_tcopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: tst M, #7 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #4 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN -dgemm_tcopy_L4_M8_BEGIN: +.Ldgemm_tcopy_L4_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L4_M8_40 + ble .Ldgemm_tcopy_L4_M8_40 .align 5 -dgemm_tcopy_L4_M8_20: +.Ldgemm_tcopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_tcopy_L4_M8_20 + bne .Ldgemm_tcopy_L4_M8_20 -dgemm_tcopy_L4_M8_40: +.Ldgemm_tcopy_L4_M8_40: tst N , #4 - ble dgemm_tcopy_L4_M8_60 + ble .Ldgemm_tcopy_L4_M8_60 COPY4x4 -dgemm_tcopy_L4_M8_60: +.Ldgemm_tcopy_L4_M8_60: tst N , #2 - ble dgemm_tcopy_L4_M8_80 + ble .Ldgemm_tcopy_L4_M8_80 COPY2x4 -dgemm_tcopy_L4_M8_80: +.Ldgemm_tcopy_L4_M8_80: tst N, #1 - ble dgemm_tcopy_L4_M8_END + ble .Ldgemm_tcopy_L4_M8_END COPY1x4 -dgemm_tcopy_L4_M8_END: +.Ldgemm_tcopy_L4_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M8_BEGIN: +.Ldgemm_tcopy_L2_M8_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L2_M8_40 + ble .Ldgemm_tcopy_L2_M8_40 .align 5 -dgemm_tcopy_L2_M8_20: +.Ldgemm_tcopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_tcopy_L2_M8_20 + bne .Ldgemm_tcopy_L2_M8_20 -dgemm_tcopy_L2_M8_40: +.Ldgemm_tcopy_L2_M8_40: tst N , #4 - ble dgemm_tcopy_L2_M8_60 + ble .Ldgemm_tcopy_L2_M8_60 COPY4x2 -dgemm_tcopy_L2_M8_60: +.Ldgemm_tcopy_L2_M8_60: tst N , #2 - ble dgemm_tcopy_L2_M8_80 + ble .Ldgemm_tcopy_L2_M8_80 COPY2x2 -dgemm_tcopy_L2_M8_80: +.Ldgemm_tcopy_L2_M8_80: tst N , #1 - ble dgemm_tcopy_L2_M8_END + ble .Ldgemm_tcopy_L2_M8_END COPY1x2 -dgemm_tcopy_L2_M8_END: +.Ldgemm_tcopy_L2_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M8_BEGIN: +.Ldgemm_tcopy_L1_M8_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #3 // I = M / 8 cmp I, #0 - ble dgemm_tcopy_L1_M8_40 + ble .Ldgemm_tcopy_L1_M8_40 .align 5 
-dgemm_tcopy_L1_M8_20: +.Ldgemm_tcopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_tcopy_L1_M8_20 + bne .Ldgemm_tcopy_L1_M8_20 -dgemm_tcopy_L1_M8_40: +.Ldgemm_tcopy_L1_M8_40: tst N , #4 - ble dgemm_tcopy_L1_M8_60 + ble .Ldgemm_tcopy_L1_M8_60 COPY4x1 -dgemm_tcopy_L1_M8_60: +.Ldgemm_tcopy_L1_M8_60: tst N , #2 - ble dgemm_tcopy_L1_M8_80 + ble .Ldgemm_tcopy_L1_M8_80 COPY2x1 -dgemm_tcopy_L1_M8_80: +.Ldgemm_tcopy_L1_M8_80: tst N , #1 - ble dgemm_tcopy_L1_M8_END + ble .Ldgemm_tcopy_L1_M8_END COPY1x1 -dgemm_tcopy_L1_M8_END: +.Ldgemm_tcopy_L1_M8_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dot.S b/kernel/arm64/dot.S index 35d47790c..a1a5bf20b 100644 --- a/kernel/arm64/dot.S +++ b/kernel/arm64/dot.S @@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble dot_kernel_L999 + ble .Ldot_kernel_L999 cmp INC_X, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN cmp INC_Y, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN -dot_kernel_F_BEGIN: +.Ldot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq dot_kernel_F1 + beq .Ldot_kernel_F1 -dot_kernel_F4: +.Ldot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne dot_kernel_F4 + bne .Ldot_kernel_F4 KERNEL_F4_FINALIZE -dot_kernel_F1: +.Ldot_kernel_F1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_F10: +.Ldot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne dot_kernel_F10 + bne .Ldot_kernel_F10 ret -dot_kernel_S_BEGIN: +.Ldot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble dot_kernel_S1 + ble .Ldot_kernel_S1 -dot_kernel_S4: +.Ldot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -206,21 +206,21 @@ dot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S4 + bne .Ldot_kernel_S4 -dot_kernel_S1: +.Ldot_kernel_S1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_S10: +.Ldot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S10 + bne .Ldot_kernel_S10 -dot_kernel_L999: +.Ldot_kernel_L999: ret diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 34fb8c233..b528aeb18 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif 
-dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -838,19 +838,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1065,9 +1065,9 @@ 
dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: 
KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S index 4aecf28eb..47956dec5 100644 --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble dtrmm_kernel_L4_BEGIN + ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L8_BEGIN: +.Ldtrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L8_M4_BEGIN: +.Ldtrmm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L8_M2_BEGIN + ble .Ldtrmm_kernel_L8_M2_BEGIN -dtrmm_kernel_L8_M4_20: +.Ldtrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L8_M4_32 + blt .Ldtrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L8_M4_22a + ble .Ldtrmm_kernel_L8_M4_22a .align 5 -dtrmm_kernel_L8_M4_22: +.Ldtrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M4_22 + bgt .Ldtrmm_kernel_L8_M4_22 -dtrmm_kernel_L8_M4_22a: +.Ldtrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_32: +.Ldtrmm_kernel_L8_M4_32: tst counterL, #1 - ble dtrmm_kernel_L8_M4_40 + ble .Ldtrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_40: +.Ldtrmm_kernel_L8_M4_40: INIT4x8 -dtrmm_kernel_L8_M4_44: +.Ldtrmm_kernel_L8_M4_44: ands counterL, tempK, #1 - ble dtrmm_kernel_L8_M4_100 + ble .Ldtrmm_kernel_L8_M4_100 -dtrmm_kernel_L8_M4_46: +.Ldtrmm_kernel_L8_M4_46: KERNEL4x8_SUB -dtrmm_kernel_L8_M4_100: +.Ldtrmm_kernel_L8_M4_100: SAVE4x8 @@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L8_M4_END: +.Ldtrmm_kernel_L8_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L8_M4_20 + bne .Ldtrmm_kernel_L8_M4_20 -dtrmm_kernel_L8_M2_BEGIN: +.Ldtrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L8_M1_BEGIN + ble .Ldtrmm_kernel_L8_M1_BEGIN -dtrmm_kernel_L8_M2_20: +.Ldtrmm_kernel_L8_M2_20: INIT2x8 @@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M2_40 + ble .Ldtrmm_kernel_L8_M2_40 -dtrmm_kernel_L8_M2_22: +.Ldtrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_22 + bgt .Ldtrmm_kernel_L8_M2_22 -dtrmm_kernel_L8_M2_40: +.Ldtrmm_kernel_L8_M2_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M2_100 + ble .Ldtrmm_kernel_L8_M2_100 -dtrmm_kernel_L8_M2_42: +.Ldtrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_42 + bgt .Ldtrmm_kernel_L8_M2_42 -dtrmm_kernel_L8_M2_100: +.Ldtrmm_kernel_L8_M2_100: SAVE2x8 @@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L8_M2_END: +.Ldtrmm_kernel_L8_M2_END: -dtrmm_kernel_L8_M1_BEGIN: +.Ldtrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END -dtrmm_kernel_L8_M1_20: +.Ldtrmm_kernel_L8_M1_20: INIT1x8 @@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M1_40 + ble .Ldtrmm_kernel_L8_M1_40 -dtrmm_kernel_L8_M1_22: +.Ldtrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_22 + bgt .Ldtrmm_kernel_L8_M1_22 -dtrmm_kernel_L8_M1_40: +.Ldtrmm_kernel_L8_M1_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M1_100 + ble .Ldtrmm_kernel_L8_M1_100 -dtrmm_kernel_L8_M1_42: +.Ldtrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_42 + bgt .Ldtrmm_kernel_L8_M1_42 -dtrmm_kernel_L8_M1_100: +.Ldtrmm_kernel_L8_M1_100: SAVE1x8 @@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100: add tempOffset, tempOffset, #1 
#endif -dtrmm_kernel_L8_END: +.Ldtrmm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 @@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L8_BEGIN + bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble dtrmm_kernel_L999 + ble .Ldtrmm_kernel_L999 tst counterJ , #4 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt 
.Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END: /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = 
counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: 
+.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 2b8173715..0ac5a5f24 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M8_BEGIN: +.Ldtrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L4_M4_BEGIN + ble .Ldtrmm_kernel_L4_M4_BEGIN .align 5 -dtrmm_kernel_L4_M8_20: +.Ldtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20: asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L4_M8_32 + blt .Ldtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K @@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dtrmm_kernel_L4_M8_22a + ble .Ldtrmm_kernel_L4_M8_22a .align 5 -dtrmm_kernel_L4_M8_22: +.Ldtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M8_22 + bgt .Ldtrmm_kernel_L4_M8_22 .align 5 -dtrmm_kernel_L4_M8_22a: +.Ldtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 .align 5 -dtrmm_kernel_L4_M8_32: +.Ldtrmm_kernel_L4_M8_32: tst counterL, #1 - ble dtrmm_kernel_L4_M8_40 + ble .Ldtrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 -dtrmm_kernel_L4_M8_40: +.Ldtrmm_kernel_L4_M8_40: INIT8x4 -dtrmm_kernel_L4_M8_44: +.Ldtrmm_kernel_L4_M8_44: ands counterL , tempK, #7 - ble dtrmm_kernel_L4_M8_100 + ble .Ldtrmm_kernel_L4_M8_100 .align 5 -dtrmm_kernel_L4_M8_46: +.Ldtrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dtrmm_kernel_L4_M8_46 + bne .Ldtrmm_kernel_L4_M8_46 -dtrmm_kernel_L4_M8_100: +.Ldtrmm_kernel_L4_M8_100: SAVE8x4 @@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -dtrmm_kernel_L4_M8_END: +.Ldtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M8_20 + bne .Ldtrmm_kernel_L4_M8_20 -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #4 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: INIT4x4 @@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_42: +.Ldtrmm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_42 + bgt .Ldtrmm_kernel_L4_M4_42 -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt 
.Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -dtrmm_kernel_L2_M8_BEGIN: +.Ldtrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L2_M4_BEGIN + ble .Ldtrmm_kernel_L2_M4_BEGIN -dtrmm_kernel_L2_M8_20: +.Ldtrmm_kernel_L2_M8_20: INIT8x2 @@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M8_40 + ble .Ldtrmm_kernel_L2_M8_40 .align 5 -dtrmm_kernel_L2_M8_22: +.Ldtrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_22 + bgt .Ldtrmm_kernel_L2_M8_22 -dtrmm_kernel_L2_M8_40: +.Ldtrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M8_100 + ble .Ldtrmm_kernel_L2_M8_100 -dtrmm_kernel_L2_M8_42: +.Ldtrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_42 + bgt .Ldtrmm_kernel_L2_M8_42 -dtrmm_kernel_L2_M8_100: +.Ldtrmm_kernel_L2_M8_100: SAVE8x2 @@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L2_M8_END: +.Ldtrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M8_20 + bgt .Ldtrmm_kernel_L2_M8_20 -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = 
counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next @@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M8_BEGIN: +.Ldtrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L1_M4_BEGIN + ble .Ldtrmm_kernel_L1_M4_BEGIN -dtrmm_kernel_L1_M8_20: +.Ldtrmm_kernel_L1_M8_20: INIT8x1 @@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M8_40 + ble .Ldtrmm_kernel_L1_M8_40 .align 5 -dtrmm_kernel_L1_M8_22: +.Ldtrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_22 + bgt .Ldtrmm_kernel_L1_M8_22 -dtrmm_kernel_L1_M8_40: +.Ldtrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M8_100 + ble .Ldtrmm_kernel_L1_M8_100 -dtrmm_kernel_L1_M8_42: +.Ldtrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_42 + bgt .Ldtrmm_kernel_L1_M8_42 -dtrmm_kernel_L1_M8_100: +.Ldtrmm_kernel_L1_M8_100: SAVE8x1 @@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L1_M8_END: +.Ldtrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M8_20 + bgt .Ldtrmm_kernel_L1_M8_20 -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M2_BEGIN + 
ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 162f721c3..658551f4f 100644 --- a/kernel/arm64/gemv_n.S +++ 
b/kernel/arm64/gemv_n.S @@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 cmp M, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ mov J, N cmp INC_Y, #1 - bne gemv_n_kernel_S_BEGIN + bne .Lgemv_n_kernel_S_BEGIN -gemv_n_kernel_F_LOOP: +.Lgemv_n_kernel_F_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP: mov Y_IPTR, Y mov Y_OPTR, Y -gemv_n_kernel_F32: +.Lgemv_n_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_n_kernel_F4 + beq .Lgemv_n_kernel_F4 -gemv_n_kernel_F320: +.Lgemv_n_kernel_F320: KERNEL_F16 KERNEL_F16 subs I, I, #1 - bne gemv_n_kernel_F320 + bne .Lgemv_n_kernel_F320 -gemv_n_kernel_F4: +.Lgemv_n_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_n_kernel_F1 + beq .Lgemv_n_kernel_F1 -gemv_n_kernel_F40: +.Lgemv_n_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_n_kernel_F40 + bne .Lgemv_n_kernel_F40 -gemv_n_kernel_F1: +.Lgemv_n_kernel_F1: ands I, M, #3 - ble gemv_n_kernel_F_END + ble .Lgemv_n_kernel_F_END -gemv_n_kernel_F10: +.Lgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_n_kernel_F10 + bne .Lgemv_n_kernel_F10 -gemv_n_kernel_F_END: +.Lgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_F_LOOP + bne .Lgemv_n_kernel_F_LOOP - b gemv_n_kernel_L999 + b .Lgemv_n_kernel_L999 -gemv_n_kernel_S_BEGIN: +.Lgemv_n_kernel_S_BEGIN: INIT_S -gemv_n_kernel_S_LOOP: +.Lgemv_n_kernel_S_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_n_kernel_S1 + ble .Lgemv_n_kernel_S1 -gemv_n_kernel_S4: +.Lgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -298,27 +298,27 @@ gemv_n_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S4 + bne .Lgemv_n_kernel_S4 -gemv_n_kernel_S1: +.Lgemv_n_kernel_S1: ands I, M, #3 - ble gemv_n_kernel_S_END + ble .Lgemv_n_kernel_S_END -gemv_n_kernel_S10: +.Lgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S10 + bne .Lgemv_n_kernel_S10 -gemv_n_kernel_S_END: +.Lgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_S_LOOP + bne .Lgemv_n_kernel_S_LOOP -gemv_n_kernel_L999: +.Lgemv_n_kernel_L999: mov w0, wzr diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 28325f784..b04367ab3 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SAVE_REGS cmp N, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 cmp M, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ mov J, N cmp INC_X, #1 - bne gemv_t_kernel_S_BEGIN + bne .Lgemv_t_kernel_S_BEGIN -gemv_t_kernel_F_LOOP: +.Lgemv_t_kernel_F_LOOP: fmov TEMP, REG0 fmov TEMP1, REG0 @@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X -gemv_t_kernel_F32: +.Lgemv_t_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_t_kernel_F4 + beq .Lgemv_t_kernel_F4 -gemv_t_kernel_F320: +.Lgemv_t_kernel_F320: KERNEL_F32 subs I, I, #1 - bne gemv_t_kernel_F320 + bne .Lgemv_t_kernel_F320 KERNEL_F32_FINALIZE -gemv_t_kernel_F4: +.Lgemv_t_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_t_kernel_F1 + beq .Lgemv_t_kernel_F1 -gemv_t_kernel_F40: +.Lgemv_t_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_t_kernel_F40 + bne .Lgemv_t_kernel_F40 -gemv_t_kernel_F1: +.Lgemv_t_kernel_F1: KERNEL_F4_FINALIZE ands I, M, #3 - ble gemv_t_kernel_F_END + ble .Lgemv_t_kernel_F_END -gemv_t_kernel_F10: +.Lgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_t_kernel_F10 + bne .Lgemv_t_kernel_F10 -gemv_t_kernel_F_END: +.Lgemv_t_kernel_F_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_F_LOOP + bne .Lgemv_t_kernel_F_LOOP - b gemv_t_kernel_L999 + b .Lgemv_t_kernel_L999 -gemv_t_kernel_S_BEGIN: +.Lgemv_t_kernel_S_BEGIN: INIT_S -gemv_t_kernel_S_LOOP: +.Lgemv_t_kernel_S_LOOP: fmov TEMP, REG0 mov A_PTR, A @@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_t_kernel_S1 + ble .Lgemv_t_kernel_S1 -gemv_t_kernel_S4: +.Lgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -329,30 +329,30 @@ gemv_t_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S4 + bne .Lgemv_t_kernel_S4 -gemv_t_kernel_S1: +.Lgemv_t_kernel_S1: ands I, M, #3 - ble gemv_t_kernel_S_END + ble .Lgemv_t_kernel_S_END -gemv_t_kernel_S10: +.Lgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S10 + bne .Lgemv_t_kernel_S10 -gemv_t_kernel_S_END: +.Lgemv_t_kernel_S_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_S_LOOP + bne .Lgemv_t_kernel_S_LOOP -gemv_t_kernel_L999: +.Lgemv_t_kernel_L999: RESTORE_REGS diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 6c0d84f98..31d0cd646 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Liamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Liamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #3 cmp I, xzr - beq iamax_kernel_F1 + beq .Liamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Liamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Liamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Liamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_F10: +.Liamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Liamax_kernel_F10 - b iamax_kernel_L999 + b .Liamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Liamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Liamax_kernel_S1 -iamax_kernel_S4: +.Liamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -293,25 +293,25 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Liamax_kernel_S4 -iamax_kernel_S1: +.Liamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_S10: +.Liamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Liamax_kernel_S10 -iamax_kernel_L999: +.Liamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Liamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index 9b252ec98..42fa4e711 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Lizamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Lizamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #3 cmp I, xzr - ble iamax_kernel_F1 + ble .Lizamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Lizamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Lizamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Lizamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_F10: +.Lizamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Lizamax_kernel_F10 - b iamax_kernel_L999 + b .Lizamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Lizamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Lizamax_kernel_S1 -iamax_kernel_S4: +.Lizamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -341,26 +341,26 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Lizamax_kernel_S4 -iamax_kernel_S1: +.Lizamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_S10: +.Lizamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Lizamax_kernel_S10 -iamax_kernel_L999: +.Lizamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Lizamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index 5d06c13c0..e2cbd4def 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -162,44 +162,44 @@ KERNEL_S1_NEXT: INIT cmp N, #0 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 cmp INC_X, #0 - beq 
nrm2_kernel_L999 + beq .Lnrm2_kernel_L999 cmp INC_X, #1 - bne nrm2_kernel_S_BEGIN + bne .Lnrm2_kernel_S_BEGIN -nrm2_kernel_F_BEGIN: +.Lnrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr - ble nrm2_kernel_F1 + ble .Lnrm2_kernel_F1 -nrm2_kernel_F8: +.Lnrm2_kernel_F8: KERNEL_F8 subs I, I, #1 - bne nrm2_kernel_F8 + bne .Lnrm2_kernel_F8 -nrm2_kernel_F1: +.Lnrm2_kernel_F1: ands I, N, #7 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 -nrm2_kernel_F10: +.Lnrm2_kernel_F10: KERNEL_F1 subs I, I, #1 - bne nrm2_kernel_F10 + bne .Lnrm2_kernel_F10 - b nrm2_kernel_L999 + b .Lnrm2_kernel_L999 -nrm2_kernel_S_BEGIN: +.Lnrm2_kernel_S_BEGIN: INIT_S @@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN: .align 5 -nrm2_kernel_S10: +.Lnrm2_kernel_S10: KERNEL_S1 subs I, I, #1 - bne nrm2_kernel_S10 + bne .Lnrm2_kernel_S10 -nrm2_kernel_L999: +.Lnrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S index 572125232..00c3085fa 100644 --- a/kernel/arm64/rot.S +++ b/kernel/arm64/rot.S @@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT cmp INC_X, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN cmp INC_Y, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN -rot_kernel_F_BEGIN: +.Lrot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq rot_kernel_F1 + beq .Lrot_kernel_F1 KERNEL_INIT_F4 -rot_kernel_F4: +.Lrot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne rot_kernel_F4 + bne .Lrot_kernel_F4 -rot_kernel_F1: +.Lrot_kernel_F1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT_F1 -rot_kernel_F10: +.Lrot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne rot_kernel_F10 + bne .Lrot_kernel_F10 mov w0, wzr ret -rot_kernel_S_BEGIN: +.Lrot_kernel_S_BEGIN: INIT_S INIT_F1 @@ -214,9 +214,9 @@ rot_kernel_S_BEGIN: asr I, N, #2 cmp I, xzr - ble rot_kernel_S1 + ble .Lrot_kernel_S1 -rot_kernel_S4: +.Lrot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -224,22 +224,22 @@ rot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S4 + bne .Lrot_kernel_S4 -rot_kernel_S1: +.Lrot_kernel_S1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 -rot_kernel_S10: +.Lrot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S10 + bne .Lrot_kernel_S10 -rot_kernel_L999: +.Lrot_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S index 91d469d03..09c41cdaa 100644 --- a/kernel/arm64/scal.S +++ b/kernel/arm64/scal.S @@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble scal_kernel_L999 + ble .Lscal_kernel_L999 fcmp DA, #0.0 - beq scal_kernel_zero + beq .Lscal_kernel_zero cmp INC_X, #1 - bne scal_kernel_S_BEGIN + bne .Lscal_kernel_S_BEGIN -scal_kernel_F_BEGIN: +.Lscal_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq scal_kernel_F1 + beq .Lscal_kernel_F1 KERNEL_INIT_F8 -scal_kernel_F8: +.Lscal_kernel_F8: KERNEL_F8 subs I, I, #1 - bne scal_kernel_F8 + bne .Lscal_kernel_F8 -scal_kernel_F1: +.Lscal_kernel_F1: ands I, N, #7 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_F10: +.Lscal_kernel_F10: KERNEL_F1 subs I, I, #1 - bne scal_kernel_F10 + bne .Lscal_kernel_F10 mov w0, wzr ret -scal_kernel_S_BEGIN: +.Lscal_kernel_S_BEGIN: INIT_S mov X_COPY, X asr I, N, #2 cmp I, xzr - ble scal_kernel_S1 + ble .Lscal_kernel_S1 -scal_kernel_S4: +.Lscal_kernel_S4: KERNEL_S4 subs I, I, #1 - bne scal_kernel_S4 + bne .Lscal_kernel_S4 -scal_kernel_S1: +.Lscal_kernel_S1: ands I, N, #3 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_S10: +.Lscal_kernel_S10: KERNEL_S1 subs I, I, #1 - bne scal_kernel_S10 + bne .Lscal_kernel_S10 -scal_kernel_L999: +.Lscal_kernel_L999: mov w0, wzr ret -scal_kernel_zero: +.Lscal_kernel_zero: INIT_S -scal_kernel_Z1: +.Lscal_kernel_Z1: st1 DAV, [X], INC_X subs N, N, #1 - bne scal_kernel_Z1 + bne .Lscal_kernel_Z1 mov w0, wzr ret diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 6e3645b76..99099ea6f 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -1070,7 +1070,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1098,11 +1098,11 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1112,21 +1112,21 @@ sgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN .align 5 -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1138,10 +1138,10 @@ sgemm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1153,10 +1153,10 @@ sgemm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 .align 5 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1167,13 +1167,13 @@ sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 .align 5 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1184,187 +1184,187 @@ sgemm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: 
INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #7 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 .align 5 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M16_46 + bne .Lsgemm_kernel_L4_M16_46 -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1372,9 +1372,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1387,34 +1387,34 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1422,9 +1422,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1436,42 +1436,42 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN 
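The sgemm hunks around this point are the same mechanical rename applied at much larger scale: every loop label inside the kernel gains a .L prefix, and every branch operand changes in lockstep within the same hunk. On ELF targets the GNU assembler treats any symbol whose name begins with .L as assembler-local and leaves it out of the object file's symbol table, which is the property the rename relies on. A minimal sketch of the effect, with a hypothetical function name not taken from the patch:

        .text
        .global tiny_loop            // exported symbol: stays in the symbol table
tiny_loop:
.Lbody:                              // .L prefix: never emitted as a symbol
        subs    x0, x0, #1
        bne     .Lbody               // branch operand renamed in lockstep with the label
        ret

With a plain "body:" label instead, the loop address would surface in the object file as an extra symbol of its own.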
/******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1479,14 +1479,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -sgemm_kernel_L2_M16_BEGIN: +.Lsgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble sgemm_kernel_L2_M8_BEGIN + ble .Lsgemm_kernel_L2_M8_BEGIN -sgemm_kernel_L2_M16_20: +.Lsgemm_kernel_L2_M16_20: INIT16x2 @@ -1494,10 +1494,10 @@ sgemm_kernel_L2_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M16_40 + ble .Lsgemm_kernel_L2_M16_40 .align 5 -sgemm_kernel_L2_M16_22: +.Lsgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1509,41 +1509,41 @@ sgemm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_22 + bgt .Lsgemm_kernel_L2_M16_22 -sgemm_kernel_L2_M16_40: +.Lsgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M16_100 + ble .Lsgemm_kernel_L2_M16_100 -sgemm_kernel_L2_M16_42: +.Lsgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_42 + bgt .Lsgemm_kernel_L2_M16_42 -sgemm_kernel_L2_M16_100: +.Lsgemm_kernel_L2_M16_100: SAVE16x2 -sgemm_kernel_L2_M16_END: +.Lsgemm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M16_20 + bgt .Lsgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #8 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1551,10 +1551,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1566,38 +1566,38 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1605,10 +1605,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: 
KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1620,40 +1620,40 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1661,9 +1661,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1676,34 +1676,34 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -1711,9 +1711,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1725,36 +1725,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1762,14 +1762,14 @@ sgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -sgemm_kernel_L1_M16_BEGIN: +.Lsgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L1_M8_BEGIN + ble .Lsgemm_kernel_L1_M8_BEGIN -sgemm_kernel_L1_M16_20: +.Lsgemm_kernel_L1_M16_20: INIT16x1 @@ -1777,10 
+1777,10 @@ sgemm_kernel_L1_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M16_40 + ble .Lsgemm_kernel_L1_M16_40 .align 5 -sgemm_kernel_L1_M16_22: +.Lsgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -1792,42 +1792,42 @@ sgemm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_22 + bgt .Lsgemm_kernel_L1_M16_22 -sgemm_kernel_L1_M16_40: +.Lsgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M16_100 + ble .Lsgemm_kernel_L1_M16_100 -sgemm_kernel_L1_M16_42: +.Lsgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_42 + bgt .Lsgemm_kernel_L1_M16_42 -sgemm_kernel_L1_M16_100: +.Lsgemm_kernel_L1_M16_100: SAVE16x1 -sgemm_kernel_L1_M16_END: +.Lsgemm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M16_20 + bgt .Lsgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #8 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -1835,10 +1835,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1850,38 +1850,38 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -1889,10 +1889,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1904,39 +1904,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , 
#3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -1944,9 +1944,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1959,34 +1959,34 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -1994,9 +1994,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2008,28 +2008,28 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S index 0ee10e130..144d4bcd6 100644 --- a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S +++ b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S @@ -1117,7 +1117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
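The ThunderX2 T99 variant of the 16x4 kernel below differs from the generic sgemm_kernel_16x4.S only in its unroll and prefetch scheme; the label treatment is identical. For completeness, GAS also offers numeric local labels, which are likewise kept out of the symbol table; this patch does not use them, and the named .L labels arguably keep kernels of this length far more readable. A sketch of that alternative, again hypothetical rather than from the patch:

1:                              // numeric local label: redefinable, never a symbol
        subs    x0, x0, #1
        bne     1b              // "1b" branches to the nearest "1:" looking backward
        ret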
PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1145,11 +1145,11 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1159,21 +1159,21 @@ sgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN .align 5 -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #4 // L = K / 16 cmp counterL , #2 - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1182,18 +1182,18 @@ sgemm_kernel_L4_M16_20: KERNEL16x4_M1_M2_x1 subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1_M2_x8 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 .align 5 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 @@ -1201,13 +1201,13 @@ sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 .align 5 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1216,187 +1216,187 @@ sgemm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #15 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 .align 5 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M16_46 + bne .Lsgemm_kernel_L4_M16_46 -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1404,9 +1404,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1419,34 +1419,34 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble 
sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1454,9 +1454,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1468,42 +1468,42 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1511,14 +1511,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -sgemm_kernel_L2_M16_BEGIN: +.Lsgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble sgemm_kernel_L2_M8_BEGIN + ble .Lsgemm_kernel_L2_M8_BEGIN -sgemm_kernel_L2_M16_20: +.Lsgemm_kernel_L2_M16_20: INIT16x2 @@ -1526,10 +1526,10 @@ sgemm_kernel_L2_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M16_40 + ble .Lsgemm_kernel_L2_M16_40 .align 5 -sgemm_kernel_L2_M16_22: +.Lsgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1541,41 +1541,41 @@ sgemm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_22 + bgt .Lsgemm_kernel_L2_M16_22 -sgemm_kernel_L2_M16_40: +.Lsgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M16_100 + ble .Lsgemm_kernel_L2_M16_100 -sgemm_kernel_L2_M16_42: +.Lsgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_42 + bgt .Lsgemm_kernel_L2_M16_42 -sgemm_kernel_L2_M16_100: +.Lsgemm_kernel_L2_M16_100: SAVE16x2 -sgemm_kernel_L2_M16_END: +.Lsgemm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M16_20 + bgt .Lsgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #8 - ble sgemm_kernel_L2_M4_BEGIN + ble 
.Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1583,10 +1583,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1598,38 +1598,38 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1637,10 +1637,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1652,40 +1652,40 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1693,9 +1693,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1708,34 +1708,34 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: 
+.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -1743,9 +1743,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1757,36 +1757,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1794,14 +1794,14 @@ sgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -sgemm_kernel_L1_M16_BEGIN: +.Lsgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L1_M8_BEGIN + ble .Lsgemm_kernel_L1_M8_BEGIN -sgemm_kernel_L1_M16_20: +.Lsgemm_kernel_L1_M16_20: INIT16x1 @@ -1809,10 +1809,10 @@ sgemm_kernel_L1_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M16_40 + ble .Lsgemm_kernel_L1_M16_40 .align 5 -sgemm_kernel_L1_M16_22: +.Lsgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -1824,42 +1824,42 @@ sgemm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_22 + bgt .Lsgemm_kernel_L1_M16_22 -sgemm_kernel_L1_M16_40: +.Lsgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M16_100 + ble .Lsgemm_kernel_L1_M16_100 -sgemm_kernel_L1_M16_42: +.Lsgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_42 + bgt .Lsgemm_kernel_L1_M16_42 -sgemm_kernel_L1_M16_100: +.Lsgemm_kernel_L1_M16_100: SAVE16x1 -sgemm_kernel_L1_M16_END: +.Lsgemm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M16_20 + bgt .Lsgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #8 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -1867,10 +1867,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1882,38 +1882,38 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt 
sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -1921,10 +1921,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1936,39 +1936,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -1976,9 +1976,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1991,34 +1991,34 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -2026,9 +2026,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2040,28 +2040,28 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 
-sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index a5cf7baff..76c11f1e1 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN: add pA_2, temp, pA_1 add pA_3, temp, pA_2 -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I // do one in the K KERNEL16x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: lsl temp, origK, #4 // k * 4 * 4 = Four rows of A add pA_0, pA_0, temp add pA_0, pA_0, temp @@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END: add pA_2, pA_1, temp add pA_3, pA_2, temp subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: INIT8x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_SUB KERNEL8x4_SUB @@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22: KERNEL8x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_42: 
+.Lsgemm_kernel_L4_M8_42: KERNEL8x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_42 + bgt .Lsgemm_kernel_L4_M8_42 -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: lsl temp, origK, #4 // k * 4 * 4 add pA_0, pA_0, temp -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_42: +.Lsgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_42 + bgt .Lsgemm_kernel_L4_M4_42 -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: lsl temp, origK, #4 add origPB, origPB, temp // B = B + K * 4 * 
4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M4_20 + bgt .Lsgemm_kernel_L2_M4_20 -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble 
sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN: -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M4_20 + bgt .Lsgemm_kernel_L1_M4_20 -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 
-sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index bd47bed31..6ba64dd35 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -1263,7 +1263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1291,12 +1291,12 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble sgemm_kernel_L4_BEGIN + ble .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L8_BEGIN: +.Lsgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -1304,156 +1304,156 @@ sgemm_kernel_L8_BEGIN: /******************************************************************************/ -sgemm_kernel_L8_M8_BEGIN: +.Lsgemm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble sgemm_kernel_L8_M4_BEGIN + ble .Lsgemm_kernel_L8_M4_BEGIN -sgemm_kernel_L8_M8_20: +.Lsgemm_kernel_L8_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L8_M8_32 + blt .Lsgemm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L8_M8_22a + ble .Lsgemm_kernel_L8_M8_22a .align 5 -sgemm_kernel_L8_M8_22: +.Lsgemm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M8_22 + bgt .Lsgemm_kernel_L8_M8_22 -sgemm_kernel_L8_M8_22a: +.Lsgemm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E - b sgemm_kernel_L8_M8_44 + b .Lsgemm_kernel_L8_M8_44 -sgemm_kernel_L8_M8_32: +.Lsgemm_kernel_L8_M8_32: tst counterL, #1 - ble sgemm_kernel_L8_M8_40 + ble .Lsgemm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E - b sgemm_kernel_L8_M8_44 + b .Lsgemm_kernel_L8_M8_44 -sgemm_kernel_L8_M8_40: +.Lsgemm_kernel_L8_M8_40: INIT8x8 -sgemm_kernel_L8_M8_44: +.Lsgemm_kernel_L8_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L8_M8_100 + ble .Lsgemm_kernel_L8_M8_100 -sgemm_kernel_L8_M8_46: +.Lsgemm_kernel_L8_M8_46: KERNEL8x8_SUB -sgemm_kernel_L8_M8_100: +.Lsgemm_kernel_L8_M8_100: SAVE8x8 -sgemm_kernel_L8_M8_END: +.Lsgemm_kernel_L8_M8_END: subs counterI, counterI, #1 - bne sgemm_kernel_L8_M8_20 + bne .Lsgemm_kernel_L8_M8_20 /******************************************************************************/ -sgemm_kernel_L8_M4_BEGIN: +.Lsgemm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END tst counterI, #4 - ble sgemm_kernel_L8_M2_BEGIN + ble .Lsgemm_kernel_L8_M2_BEGIN -sgemm_kernel_L8_M4_20: +.Lsgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L8_M4_32 + blt .Lsgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L8_M4_22a + ble .Lsgemm_kernel_L8_M4_22a .align 5 -sgemm_kernel_L8_M4_22: +.Lsgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M4_22 + bgt .Lsgemm_kernel_L8_M4_22 -sgemm_kernel_L8_M4_22a: +.Lsgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b sgemm_kernel_L8_M4_44 + b .Lsgemm_kernel_L8_M4_44 -sgemm_kernel_L8_M4_32: +.Lsgemm_kernel_L8_M4_32: tst counterL, #1 - ble sgemm_kernel_L8_M4_40 + ble .Lsgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b sgemm_kernel_L8_M4_44 + b .Lsgemm_kernel_L8_M4_44 -sgemm_kernel_L8_M4_40: +.Lsgemm_kernel_L8_M4_40: INIT4x8 -sgemm_kernel_L8_M4_44: +.Lsgemm_kernel_L8_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L8_M4_100 + ble .Lsgemm_kernel_L8_M4_100 -sgemm_kernel_L8_M4_46: +.Lsgemm_kernel_L8_M4_46: KERNEL4x8_SUB -sgemm_kernel_L8_M4_100: +.Lsgemm_kernel_L8_M4_100: SAVE4x8 -sgemm_kernel_L8_M4_END: +.Lsgemm_kernel_L8_M4_END: /******************************************************************************/ -sgemm_kernel_L8_M2_BEGIN: +.Lsgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L8_M1_BEGIN + ble .Lsgemm_kernel_L8_M1_BEGIN -sgemm_kernel_L8_M2_20: +.Lsgemm_kernel_L8_M2_20: INIT2x8 @@ -1461,9 +1461,9 @@ sgemm_kernel_L8_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L8_M2_40 + ble .Lsgemm_kernel_L8_M2_40 -sgemm_kernel_L8_M2_22: +.Lsgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1476,35 +1476,35 @@ sgemm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M2_22 + bgt .Lsgemm_kernel_L8_M2_22 -sgemm_kernel_L8_M2_40: +.Lsgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L8_M2_100 + ble .Lsgemm_kernel_L8_M2_100 -sgemm_kernel_L8_M2_42: +.Lsgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M2_42 + bgt .Lsgemm_kernel_L8_M2_42 -sgemm_kernel_L8_M2_100: +.Lsgemm_kernel_L8_M2_100: SAVE2x8 -sgemm_kernel_L8_M2_END: +.Lsgemm_kernel_L8_M2_END: /******************************************************************************/ -sgemm_kernel_L8_M1_BEGIN: +.Lsgemm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END -sgemm_kernel_L8_M1_20: +.Lsgemm_kernel_L8_M1_20: INIT1x8 @@ -1512,9 +1512,9 @@ sgemm_kernel_L8_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L8_M1_40 + ble .Lsgemm_kernel_L8_M1_40 -sgemm_kernel_L8_M1_22: +.Lsgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1526,43 +1526,43 @@ sgemm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M1_22 + bgt .Lsgemm_kernel_L8_M1_22 -sgemm_kernel_L8_M1_40: +.Lsgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L8_M1_100 + ble .Lsgemm_kernel_L8_M1_100 -sgemm_kernel_L8_M1_42: +.Lsgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M1_42 + bgt .Lsgemm_kernel_L8_M1_42 -sgemm_kernel_L8_M1_100: +.Lsgemm_kernel_L8_M1_100: SAVE1x8 -sgemm_kernel_L8_END: +.Lsgemm_kernel_L8_END: lsl temp, origK, #5 // B = B + K * 4 * 8 add origPB, origPB, temp subs counterJ, counterJ , #1 // j-- - bgt 
sgemm_kernel_L8_BEGIN + bgt .Lsgemm_kernel_L8_BEGIN /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #4 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1572,156 +1572,156 @@ sgemm_kernel_L4_BEGIN: /******************************************************************************/ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M8_20 + bne .Lsgemm_kernel_L4_M8_20 /******************************************************************************/ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: /******************************************************************************/ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1729,9 +1729,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1744,35 +1744,35 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: /******************************************************************************/ -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1780,9 +1780,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1794,39 +1794,39 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 
/******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1836,14 +1836,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction /******************************************************************************/ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1851,10 +1851,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1866,42 +1866,42 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M8_20 + bgt .Lsgemm_kernel_L2_M8_20 /******************************************************************************/ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1909,10 +1909,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1924,39 +1924,39 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: /******************************************************************************/ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1964,9 +1964,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp 
counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1979,35 +1979,35 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: /******************************************************************************/ -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -2015,9 +2015,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2029,37 +2029,37 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2069,14 +2069,14 @@ sgemm_kernel_L1_BEGIN: /******************************************************************************/ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -2084,10 +2084,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2099,42 +2099,42 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M8_20 + bgt .Lsgemm_kernel_L1_M8_20 
/******************************************************************************/ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -2142,10 +2142,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2157,39 +2157,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: /******************************************************************************/ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -2197,9 +2197,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2212,35 +2212,35 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: /******************************************************************************/ -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -2248,9 +2248,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2262,30 +2262,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: 
+.Lsgemm_kernel_L1_END: /******************************************************************************/ -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 77e05103d..985a0a9a6 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -1035,7 +1035,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1066,11 +1066,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1084,15 +1084,15 @@ strmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -strmm_kernel_L4_M16_BEGIN: +.Lstrmm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L4_M8_BEGIN + ble .Lstrmm_kernel_L4_M8_BEGIN .align 5 -strmm_kernel_L4_M16_20: +.Lstrmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1114,7 +1114,7 @@ strmm_kernel_L4_M16_20: asr counterL , tempK, #3 cmp counterL , #2 - blt strmm_kernel_L4_M16_32 + blt .Lstrmm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1126,10 +1126,10 @@ strmm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble strmm_kernel_L4_M16_22a + ble .Lstrmm_kernel_L4_M16_22a .align 5 -strmm_kernel_L4_M16_22: +.Lstrmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1141,10 +1141,10 @@ strmm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M16_22 + bgt .Lstrmm_kernel_L4_M16_22 .align 5 -strmm_kernel_L4_M16_22a: +.Lstrmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1155,13 +1155,13 @@ strmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 .align 5 -strmm_kernel_L4_M16_32: +.Lstrmm_kernel_L4_M16_32: tst counterL, #1 - ble strmm_kernel_L4_M16_40 + ble .Lstrmm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1172,25 +1172,25 @@ strmm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 -strmm_kernel_L4_M16_40: +.Lstrmm_kernel_L4_M16_40: INIT16x4 -strmm_kernel_L4_M16_44: +.Lstrmm_kernel_L4_M16_44: ands counterL , tempK, #7 - ble strmm_kernel_L4_M16_100 + ble .Lstrmm_kernel_L4_M16_100 .align 5 -strmm_kernel_L4_M16_46: +.Lstrmm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne strmm_kernel_L4_M16_46 + bne .Lstrmm_kernel_L4_M16_46 -strmm_kernel_L4_M16_100: +.Lstrmm_kernel_L4_M16_100: SAVE16x4 @@ -1213,22 +1213,22 @@ strmm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -strmm_kernel_L4_M16_END: +.Lstrmm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M16_20 + bne .Lstrmm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L4_M8_BEGIN: +.Lstrmm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #8 - ble strmm_kernel_L4_M4_BEGIN + ble .Lstrmm_kernel_L4_M4_BEGIN -strmm_kernel_L4_M8_20: 
+.Lstrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1250,54 +1250,54 @@ strmm_kernel_L4_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M8_32 + blt .Lstrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M8_22a + ble .Lstrmm_kernel_L4_M8_22a .align 5 -strmm_kernel_L4_M8_22: +.Lstrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M8_22 + bgt .Lstrmm_kernel_L4_M8_22 -strmm_kernel_L4_M8_22a: +.Lstrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_32: +.Lstrmm_kernel_L4_M8_32: tst counterL, #1 - ble strmm_kernel_L4_M8_40 + ble .Lstrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_40: +.Lstrmm_kernel_L4_M8_40: INIT8x4 -strmm_kernel_L4_M8_44: +.Lstrmm_kernel_L4_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M8_100 + ble .Lstrmm_kernel_L4_M8_100 -strmm_kernel_L4_M8_46: +.Lstrmm_kernel_L4_M8_46: KERNEL8x4_SUB -strmm_kernel_L4_M8_100: +.Lstrmm_kernel_L4_M8_100: SAVE8x4 @@ -1317,20 +1317,20 @@ strmm_kernel_L4_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L4_M8_END: +.Lstrmm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #4 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1350,54 +1350,54 @@ strmm_kernel_L4_M4_20: #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -1415,20 +1415,20 @@ strmm_kernel_L4_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -1451,9 +1451,9 @@ strmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1466,22 +1466,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -1500,15 +1500,15 @@ strmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -1531,9 +1531,9 @@ strmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1545,22 +1545,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -1579,26 +1579,26 @@ 
strmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1609,14 +1609,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -strmm_kernel_L2_M16_BEGIN: +.Lstrmm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble strmm_kernel_L2_M8_BEGIN + ble .Lstrmm_kernel_L2_M8_BEGIN -strmm_kernel_L2_M16_20: +.Lstrmm_kernel_L2_M16_20: INIT16x2 @@ -1640,10 +1640,10 @@ strmm_kernel_L2_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M16_40 + ble .Lstrmm_kernel_L2_M16_40 .align 5 -strmm_kernel_L2_M16_22: +.Lstrmm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1655,22 +1655,22 @@ strmm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_22 + bgt .Lstrmm_kernel_L2_M16_22 -strmm_kernel_L2_M16_40: +.Lstrmm_kernel_L2_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M16_100 + ble .Lstrmm_kernel_L2_M16_100 -strmm_kernel_L2_M16_42: +.Lstrmm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_42 + bgt .Lstrmm_kernel_L2_M16_42 -strmm_kernel_L2_M16_100: +.Lstrmm_kernel_L2_M16_100: SAVE16x2 @@ -1690,22 +1690,22 @@ strmm_kernel_L2_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L2_M16_END: +.Lstrmm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M16_20 + bgt .Lstrmm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L2_M8_BEGIN: +.Lstrmm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #8 - ble strmm_kernel_L2_M4_BEGIN + ble .Lstrmm_kernel_L2_M4_BEGIN -strmm_kernel_L2_M8_20: +.Lstrmm_kernel_L2_M8_20: INIT8x2 @@ -1729,10 +1729,10 @@ strmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M8_40 + ble .Lstrmm_kernel_L2_M8_40 .align 5 -strmm_kernel_L2_M8_22: +.Lstrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1744,22 +1744,22 @@ strmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_22 + bgt .Lstrmm_kernel_L2_M8_22 -strmm_kernel_L2_M8_40: +.Lstrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M8_100 + ble .Lstrmm_kernel_L2_M8_100 -strmm_kernel_L2_M8_42: +.Lstrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_42 + bgt .Lstrmm_kernel_L2_M8_42 -strmm_kernel_L2_M8_100: +.Lstrmm_kernel_L2_M8_100: SAVE8x2 @@ -1779,19 +1779,19 @@ strmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L2_M8_END: +.Lstrmm_kernel_L2_M8_END: //------------------------------------------------------------------------------ 
-strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #4 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -1814,10 +1814,10 @@ strmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1829,22 +1829,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -1863,21 +1863,21 @@ strmm_kernel_L2_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble .Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -1900,9 +1900,9 @@ strmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1915,22 +1915,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1949,15 +1949,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1980,9 +1980,9 @@ strmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1994,22 +1994,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt 
.Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -2028,7 +2028,7 @@ strmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2036,11 +2036,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2051,14 +2051,14 @@ strmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -strmm_kernel_L1_M16_BEGIN: +.Lstrmm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L1_M8_BEGIN + ble .Lstrmm_kernel_L1_M8_BEGIN -strmm_kernel_L1_M16_20: +.Lstrmm_kernel_L1_M16_20: INIT16x1 @@ -2082,10 +2082,10 @@ strmm_kernel_L1_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M16_40 + ble .Lstrmm_kernel_L1_M16_40 .align 5 -strmm_kernel_L1_M16_22: +.Lstrmm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -2097,22 +2097,22 @@ strmm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_22 + bgt .Lstrmm_kernel_L1_M16_22 -strmm_kernel_L1_M16_40: +.Lstrmm_kernel_L1_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M16_100 + ble .Lstrmm_kernel_L1_M16_100 -strmm_kernel_L1_M16_42: +.Lstrmm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_42 + bgt .Lstrmm_kernel_L1_M16_42 -strmm_kernel_L1_M16_100: +.Lstrmm_kernel_L1_M16_100: SAVE16x1 @@ -2132,23 +2132,23 @@ strmm_kernel_L1_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L1_M16_END: +.Lstrmm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M16_20 + bgt .Lstrmm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L1_M8_BEGIN: +.Lstrmm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #8 - ble strmm_kernel_L1_M4_BEGIN + ble .Lstrmm_kernel_L1_M4_BEGIN -strmm_kernel_L1_M8_20: +.Lstrmm_kernel_L1_M8_20: INIT8x1 @@ -2172,10 +2172,10 @@ strmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M8_40 + ble .Lstrmm_kernel_L1_M8_40 .align 5 -strmm_kernel_L1_M8_22: +.Lstrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2187,22 +2187,22 @@ strmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_22 + bgt .Lstrmm_kernel_L1_M8_22 -strmm_kernel_L1_M8_40: +.Lstrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M8_100 + ble .Lstrmm_kernel_L1_M8_100 -strmm_kernel_L1_M8_42: +.Lstrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_42 + bgt .Lstrmm_kernel_L1_M8_42 -strmm_kernel_L1_M8_100: +.Lstrmm_kernel_L1_M8_100: SAVE8x1 @@ -2222,19 +2222,19 @@ strmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L1_M8_END: +.Lstrmm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #4 - 
ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -2257,10 +2257,10 @@ strmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2272,22 +2272,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -2306,20 +2306,20 @@ strmm_kernel_L1_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -2342,9 +2342,9 @@ strmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2357,22 +2357,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -2391,15 +2391,15 @@ strmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -2422,9 +2422,9 @@ strmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2436,28 +2436,28 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 
16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index eeb3e6e72..5f7818c40 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -539,11 +539,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M4_20 + bne .Lstrmm_kernel_L4_M4_20 -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, 
#1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100: #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100: #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) @@ -825,19 +825,19 @@ strmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M4_20 + bgt .Lstrmm_kernel_L2_M4_20 -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble 
.Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt .Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1107,11 +1107,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB 
subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M4_20 + bgt .Lstrmm_kernel_L1_M4_20 -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100: #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 @@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100: #endif #endif -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: #if 0 #if !defined(LEFT) @@ -1385,7 +1385,7 @@ strmm_kernel_L1_END: #endif #endif -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S index 843f0c890..cd18e6847 100644 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -1257,7 +1257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 	PROLOGUE

-strmm_kernel_begin:
+.Lstrmm_kernel_begin:

 	.align 5
 	add sp, sp, #-(11 * 16)
@@ -1288,12 +1288,12 @@ strmm_kernel_begin:
 	mov counterJ, origN
 	asr counterJ, counterJ, #3 // J = J / 8
 	cmp counterJ, #0
-	ble strmm_kernel_L4_BEGIN
+	ble .Lstrmm_kernel_L4_BEGIN

 /******************************************************************************/
 /******************************************************************************/

-strmm_kernel_L8_BEGIN:
+.Lstrmm_kernel_L8_BEGIN:
 	mov pCRow0, pC // pCRow0 = C
 	add pC, pC, LDC, lsl #3
@@ -1305,14 +1305,14 @@ strmm_kernel_L8_BEGIN:

 /******************************************************************************/

-strmm_kernel_L8_M8_BEGIN:
+.Lstrmm_kernel_L8_M8_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #3 // counterI = counterI / 8
 	cmp counterI, #0
-	ble strmm_kernel_L8_M4_BEGIN
+	ble .Lstrmm_kernel_L8_M4_BEGIN

-strmm_kernel_L8_M8_20:
+.Lstrmm_kernel_L8_M8_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 	mov pB, origPB
@@ -1333,54 +1333,54 @@ strmm_kernel_L8_M8_20:
 	asr counterL , tempK, #1 // L = K / 2
 	cmp counterL , #2 // is there at least 4 to do?
-	blt strmm_kernel_L8_M8_32
+	blt .Lstrmm_kernel_L8_M8_32

 	KERNEL8x8_I // do one in the K
 	KERNEL8x8_M2 // do another in the K

 	subs counterL, counterL, #2
-	ble strmm_kernel_L8_M8_22a
+	ble .Lstrmm_kernel_L8_M8_22a
 	.align 5

-strmm_kernel_L8_M8_22:
+.Lstrmm_kernel_L8_M8_22:

 	KERNEL8x8_M1
 	KERNEL8x8_M2

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M8_22
+	bgt .Lstrmm_kernel_L8_M8_22

-strmm_kernel_L8_M8_22a:
+.Lstrmm_kernel_L8_M8_22a:

 	KERNEL8x8_M1
 	KERNEL8x8_E

-	b strmm_kernel_L8_M8_44
+	b .Lstrmm_kernel_L8_M8_44

-strmm_kernel_L8_M8_32:
+.Lstrmm_kernel_L8_M8_32:

 	tst counterL, #1
-	ble strmm_kernel_L8_M8_40
+	ble .Lstrmm_kernel_L8_M8_40

 	KERNEL8x8_I
 	KERNEL8x8_E

-	b strmm_kernel_L8_M8_44
+	b .Lstrmm_kernel_L8_M8_44

-strmm_kernel_L8_M8_40:
+.Lstrmm_kernel_L8_M8_40:

 	INIT8x8

-strmm_kernel_L8_M8_44:
+.Lstrmm_kernel_L8_M8_44:

 	ands counterL , tempK, #1
-	ble strmm_kernel_L8_M8_100
+	ble .Lstrmm_kernel_L8_M8_100

-strmm_kernel_L8_M8_46:
+.Lstrmm_kernel_L8_M8_46:

 	KERNEL8x8_SUB

-strmm_kernel_L8_M8_100:
+.Lstrmm_kernel_L8_M8_100:

 	SAVE8x8

@@ -1399,22 +1399,22 @@ strmm_kernel_L8_M8_100:
 	add tempOffset, tempOffset, #8
 #endif

-strmm_kernel_L8_M8_END:
+.Lstrmm_kernel_L8_M8_END:
 	subs counterI, counterI, #1
-	bne strmm_kernel_L8_M8_20
+	bne .Lstrmm_kernel_L8_M8_20

 /******************************************************************************/

-strmm_kernel_L8_M4_BEGIN:
+.Lstrmm_kernel_L8_M4_BEGIN:

 	mov counterI, origM
 	tst counterI , #7
-	ble strmm_kernel_L8_END
+	ble .Lstrmm_kernel_L8_END

 	tst counterI, #4
-	ble strmm_kernel_L8_M2_BEGIN
+	ble .Lstrmm_kernel_L8_M2_BEGIN

-strmm_kernel_L8_M4_20:
+.Lstrmm_kernel_L8_M4_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 	mov pB, origPB
@@ -1436,54 +1436,54 @@ strmm_kernel_L8_M4_20:
 	asr counterL , tempK, #1 // L = K / 2
 	cmp counterL , #2 // is there at least 4 to do?
-	blt strmm_kernel_L8_M4_32
+	blt .Lstrmm_kernel_L8_M4_32

 	KERNEL4x8_I // do one in the K
 	KERNEL4x8_M2 // do another in the K

 	subs counterL, counterL, #2
-	ble strmm_kernel_L8_M4_22a
+	ble .Lstrmm_kernel_L8_M4_22a
 	.align 5

-strmm_kernel_L8_M4_22:
+.Lstrmm_kernel_L8_M4_22:

 	KERNEL4x8_M1
 	KERNEL4x8_M2

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M4_22
+	bgt .Lstrmm_kernel_L8_M4_22

-strmm_kernel_L8_M4_22a:
+.Lstrmm_kernel_L8_M4_22a:

 	KERNEL4x8_M1
 	KERNEL4x8_E

-	b strmm_kernel_L8_M4_44
+	b .Lstrmm_kernel_L8_M4_44

-strmm_kernel_L8_M4_32:
+.Lstrmm_kernel_L8_M4_32:

 	tst counterL, #1
-	ble strmm_kernel_L8_M4_40
+	ble .Lstrmm_kernel_L8_M4_40

 	KERNEL4x8_I
 	KERNEL4x8_E

-	b strmm_kernel_L8_M4_44
+	b .Lstrmm_kernel_L8_M4_44

-strmm_kernel_L8_M4_40:
+.Lstrmm_kernel_L8_M4_40:

 	INIT4x8

-strmm_kernel_L8_M4_44:
+.Lstrmm_kernel_L8_M4_44:

 	ands counterL , tempK, #1
-	ble strmm_kernel_L8_M4_100
+	ble .Lstrmm_kernel_L8_M4_100

-strmm_kernel_L8_M4_46:
+.Lstrmm_kernel_L8_M4_46:

 	KERNEL4x8_SUB

-strmm_kernel_L8_M4_100:
+.Lstrmm_kernel_L8_M4_100:

 	SAVE4x8

@@ -1503,20 +1503,20 @@ strmm_kernel_L8_M4_100:
 	add tempOffset, tempOffset, #4
 #endif

-strmm_kernel_L8_M4_END:
+.Lstrmm_kernel_L8_M4_END:

 /******************************************************************************/

-strmm_kernel_L8_M2_BEGIN:
+.Lstrmm_kernel_L8_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble strmm_kernel_L8_END
+	ble .Lstrmm_kernel_L8_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble strmm_kernel_L8_M1_BEGIN
+	ble .Lstrmm_kernel_L8_M1_BEGIN

-strmm_kernel_L8_M2_20:
+.Lstrmm_kernel_L8_M2_20:

 	INIT2x8

@@ -1540,9 +1540,9 @@ strmm_kernel_L8_M2_20:
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L8_M2_40
+	ble .Lstrmm_kernel_L8_M2_40

-strmm_kernel_L8_M2_22:
+.Lstrmm_kernel_L8_M2_22:

 	KERNEL2x8_SUB
 	KERNEL2x8_SUB
@@ -1555,22 +1555,22 @@ strmm_kernel_L8_M2_22:
 	KERNEL2x8_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M2_22
+	bgt .Lstrmm_kernel_L8_M2_22

-strmm_kernel_L8_M2_40:
+.Lstrmm_kernel_L8_M2_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L8_M2_100
+	ble .Lstrmm_kernel_L8_M2_100

-strmm_kernel_L8_M2_42:
+.Lstrmm_kernel_L8_M2_42:

 	KERNEL2x8_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M2_42
+	bgt .Lstrmm_kernel_L8_M2_42

-strmm_kernel_L8_M2_100:
+.Lstrmm_kernel_L8_M2_100:

 	SAVE2x8

@@ -1590,16 +1590,16 @@ strmm_kernel_L8_M2_100:
 	add tempOffset, tempOffset, #2
 #endif

-strmm_kernel_L8_M2_END:
+.Lstrmm_kernel_L8_M2_END:

 /******************************************************************************/

-strmm_kernel_L8_M1_BEGIN:
+.Lstrmm_kernel_L8_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble strmm_kernel_L8_END
+	ble .Lstrmm_kernel_L8_END

-strmm_kernel_L8_M1_20:
+.Lstrmm_kernel_L8_M1_20:

 	INIT1x8

@@ -1623,9 +1623,9 @@ strmm_kernel_L8_M1_20:
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L8_M1_40
+	ble .Lstrmm_kernel_L8_M1_40

-strmm_kernel_L8_M1_22:
+.Lstrmm_kernel_L8_M1_22:
 	KERNEL1x8_SUB
 	KERNEL1x8_SUB
 	KERNEL1x8_SUB
@@ -1637,22 +1637,22 @@ strmm_kernel_L8_M1_22:
 	KERNEL1x8_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M1_22
+	bgt .Lstrmm_kernel_L8_M1_22

-strmm_kernel_L8_M1_40:
+.Lstrmm_kernel_L8_M1_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L8_M1_100
+	ble .Lstrmm_kernel_L8_M1_100

-strmm_kernel_L8_M1_42:
+.Lstrmm_kernel_L8_M1_42:

 	KERNEL1x8_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L8_M1_42
+	bgt .Lstrmm_kernel_L8_M1_42

-strmm_kernel_L8_M1_100:
+.Lstrmm_kernel_L8_M1_100:
 	SAVE1x8

@@ -1672,7 +1672,7 @@ strmm_kernel_L8_M1_100:
 	add tempOffset, tempOffset, #1
 #endif

-strmm_kernel_L8_END:
+.Lstrmm_kernel_L8_END:

 	lsl temp, origK, #5 // B = B + K * 4 * 8
 	add origPB, origPB, temp
@@ -1681,19 +1681,19 @@ strmm_kernel_L8_END:
 #endif

 	subs counterJ, counterJ , #1 // j--
-	bgt strmm_kernel_L8_BEGIN
+	bgt .Lstrmm_kernel_L8_BEGIN

 /******************************************************************************/
 /******************************************************************************/

-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:

 	mov counterJ , origN
 	tst counterJ , #7
-	ble strmm_kernel_L999
+	ble .Lstrmm_kernel_L999

 	tst counterJ , #4
-	ble strmm_kernel_L2_BEGIN
+	ble .Lstrmm_kernel_L2_BEGIN

 	mov pCRow0, pC // pCRow0 = pC

@@ -1707,14 +1707,14 @@ strmm_kernel_L4_BEGIN:

 /******************************************************************************/

-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #3 // counterI = counterI / 8
 	cmp counterI, #0
-	ble strmm_kernel_L4_M4_BEGIN
+	ble .Lstrmm_kernel_L4_M4_BEGIN

-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 	mov pB, origPB
@@ -1736,54 +1736,54 @@ strmm_kernel_L4_M8_20:
 	asr counterL , tempK, #1 // L = K / 2
 	cmp counterL , #2 // is there at least 4 to do?
-	blt strmm_kernel_L4_M8_32
+	blt .Lstrmm_kernel_L4_M8_32

 	KERNEL8x4_I // do one in the K
 	KERNEL8x4_M2 // do another in the K

 	subs counterL, counterL, #2
-	ble strmm_kernel_L4_M8_22a
+	ble .Lstrmm_kernel_L4_M8_22a
 	.align 5

-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:

 	KERNEL8x4_M1
 	KERNEL8x4_M2

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M8_22
+	bgt .Lstrmm_kernel_L4_M8_22

-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:

 	KERNEL8x4_M1
 	KERNEL8x4_E

-	b strmm_kernel_L4_M8_44
+	b .Lstrmm_kernel_L4_M8_44

-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:

 	tst counterL, #1
-	ble strmm_kernel_L4_M8_40
+	ble .Lstrmm_kernel_L4_M8_40

 	KERNEL8x4_I
 	KERNEL8x4_E

-	b strmm_kernel_L4_M8_44
+	b .Lstrmm_kernel_L4_M8_44

-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:

 	INIT8x4

-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:

 	ands counterL , tempK, #1
-	ble strmm_kernel_L4_M8_100
+	ble .Lstrmm_kernel_L4_M8_100

-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:

 	KERNEL8x4_SUB

-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:

 	SAVE8x4

@@ -1802,22 +1802,22 @@ strmm_kernel_L4_M8_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:
 	subs counterI, counterI, #1
-	bne strmm_kernel_L4_M8_20
+	bne .Lstrmm_kernel_L4_M8_20

 /******************************************************************************/

-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:

 	mov counterI, origM
 	tst counterI , #7
-	ble strmm_kernel_L4_END
+	ble .Lstrmm_kernel_L4_END

 	tst counterI, #4
-	ble strmm_kernel_L4_M2_BEGIN
+	ble .Lstrmm_kernel_L4_M2_BEGIN

-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 	mov pB, origPB
@@ -1837,54 +1837,54 @@ strmm_kernel_L4_M4_20:
 #endif
 	asr counterL , tempK, #1 // L = K / 2
 	cmp counterL , #2 // is there at least 4 to do?
-	blt strmm_kernel_L4_M4_32
+	blt .Lstrmm_kernel_L4_M4_32

 	KERNEL4x4_I // do one in the K
 	KERNEL4x4_M2 // do another in the K

 	subs counterL, counterL, #2
-	ble strmm_kernel_L4_M4_22a
+	ble .Lstrmm_kernel_L4_M4_22a
 	.align 5

-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:

 	KERNEL4x4_M1
 	KERNEL4x4_M2

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M4_22
+	bgt .Lstrmm_kernel_L4_M4_22

-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:

 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b strmm_kernel_L4_M4_44
+	b .Lstrmm_kernel_L4_M4_44

-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:

 	tst counterL, #1
-	ble strmm_kernel_L4_M4_40
+	ble .Lstrmm_kernel_L4_M4_40

 	KERNEL4x4_I
 	KERNEL4x4_E

-	b strmm_kernel_L4_M4_44
+	b .Lstrmm_kernel_L4_M4_44

-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:

 	INIT4x4

-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:

 	ands counterL , tempK, #1
-	ble strmm_kernel_L4_M4_100
+	ble .Lstrmm_kernel_L4_M4_100

-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:

 	KERNEL4x4_SUB

-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:

 	SAVE4x4

@@ -1902,20 +1902,20 @@ strmm_kernel_L4_M4_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:

 /******************************************************************************/

-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble strmm_kernel_L4_END
+	ble .Lstrmm_kernel_L4_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble strmm_kernel_L4_M1_BEGIN
+	ble .Lstrmm_kernel_L4_M1_BEGIN

-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:

 	INIT2x4

@@ -1938,9 +1938,9 @@ strmm_kernel_L4_M2_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L4_M2_40
+	ble .Lstrmm_kernel_L4_M2_40

-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:

 	KERNEL2x4_SUB
 	KERNEL2x4_SUB
@@ -1953,22 +1953,22 @@ strmm_kernel_L4_M2_22:
 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M2_22
+	bgt .Lstrmm_kernel_L4_M2_22

-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L4_M2_100
+	ble .Lstrmm_kernel_L4_M2_100

-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:

 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M2_42
+	bgt .Lstrmm_kernel_L4_M2_42

-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:

 	SAVE2x4

@@ -1987,16 +1987,16 @@ strmm_kernel_L4_M2_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:

 /******************************************************************************/

-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble strmm_kernel_L4_END
+	ble .Lstrmm_kernel_L4_END

-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:

 	INIT1x4

@@ -2019,9 +2019,9 @@ strmm_kernel_L4_M1_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L4_M1_40
+	ble .Lstrmm_kernel_L4_M1_40

-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
@@ -2033,22 +2033,22 @@ strmm_kernel_L4_M1_22:
 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M1_22
+	bgt .Lstrmm_kernel_L4_M1_22

-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L4_M1_100
+	ble .Lstrmm_kernel_L4_M1_100

-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:

 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L4_M1_42
+	bgt .Lstrmm_kernel_L4_M1_42
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:

 	SAVE1x4

@@ -2067,7 +2067,7 @@ strmm_kernel_L4_M1_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
 	add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
 #if !defined(LEFT)
 	add tempOffset, tempOffset, #4
@@ -2076,14 +2076,14 @@ strmm_kernel_L4_END:

 /******************************************************************************/
 /******************************************************************************/

-strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction

 	mov counterJ , origN
 	tst counterJ , #3
-	ble strmm_kernel_L999
+	ble .Lstrmm_kernel_L999

 	tst counterJ , #2
-	ble strmm_kernel_L1_BEGIN
+	ble .Lstrmm_kernel_L1_BEGIN

 	mov pCRow0, pC // pCRow0 = pC

@@ -2096,14 +2096,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction

 /******************************************************************************/

-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #3 // counterI = counterI / 8
 	cmp counterI,#0
-	ble strmm_kernel_L2_M4_BEGIN
+	ble .Lstrmm_kernel_L2_M4_BEGIN

-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:

 	INIT8x2

@@ -2126,10 +2126,10 @@ strmm_kernel_L2_M8_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble strmm_kernel_L2_M8_40
+	ble .Lstrmm_kernel_L2_M8_40

 	.align 5
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
 	KERNEL8x2_SUB
 	KERNEL8x2_SUB
 	KERNEL8x2_SUB
@@ -2141,22 +2141,22 @@ strmm_kernel_L2_M8_22:
 	KERNEL8x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M8_22
+	bgt .Lstrmm_kernel_L2_M8_22

-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L2_M8_100
+	ble .Lstrmm_kernel_L2_M8_100

-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:

 	KERNEL8x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M8_42
+	bgt .Lstrmm_kernel_L2_M8_42

-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:

 	SAVE8x2

@@ -2175,23 +2175,23 @@ strmm_kernel_L2_M8_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:

 	subs counterI, counterI, #1
-	bgt strmm_kernel_L2_M8_20
+	bgt .Lstrmm_kernel_L2_M8_20

 /******************************************************************************/

-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:

 	mov counterI, origM
 	tst counterI , #7
-	ble strmm_kernel_L2_END
+	ble .Lstrmm_kernel_L2_END

 	tst counterI, #4
-	ble strmm_kernel_L2_M2_BEGIN
+	ble .Lstrmm_kernel_L2_M2_BEGIN

-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:

 	INIT4x2

@@ -2214,10 +2214,10 @@ strmm_kernel_L2_M4_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble strmm_kernel_L2_M4_40
+	ble .Lstrmm_kernel_L2_M4_40

 	.align 5
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
@@ -2229,22 +2229,22 @@ strmm_kernel_L2_M4_22:
 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M4_22
+	bgt .Lstrmm_kernel_L2_M4_22

-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L2_M4_100
+	ble .Lstrmm_kernel_L2_M4_100

-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:

 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M4_42
+	bgt .Lstrmm_kernel_L2_M4_42

-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:

 	SAVE4x2

@@ -2263,20 +2263,20 @@ strmm_kernel_L2_M4_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:

 /******************************************************************************/

-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble strmm_kernel_L2_END
+	ble .Lstrmm_kernel_L2_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble strmm_kernel_L2_M1_BEGIN
+	ble .Lstrmm_kernel_L2_M1_BEGIN

-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:

 	INIT2x2

@@ -2299,9 +2299,9 @@ strmm_kernel_L2_M2_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble strmm_kernel_L2_M2_40
+	ble .Lstrmm_kernel_L2_M2_40

-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:

 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
@@ -2314,22 +2314,22 @@ strmm_kernel_L2_M2_22:
 	KERNEL2x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M2_22
+	bgt .Lstrmm_kernel_L2_M2_22

-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L2_M2_100
+	ble .Lstrmm_kernel_L2_M2_100

-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:

 	KERNEL2x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M2_42
+	bgt .Lstrmm_kernel_L2_M2_42

-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:

 	SAVE2x2

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2348,16 +2348,16 @@ strmm_kernel_L2_M2_100:
 	add tempOffset, tempOffset, #2
 #endif

-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:

 /******************************************************************************/

-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble strmm_kernel_L2_END
+	ble .Lstrmm_kernel_L2_END

-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:

 	INIT1x2

@@ -2380,9 +2380,9 @@ strmm_kernel_L2_M1_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL, #0
-	ble strmm_kernel_L2_M1_40
+	ble .Lstrmm_kernel_L2_M1_40

-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
@@ -2394,22 +2394,22 @@ strmm_kernel_L2_M1_22:
 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M1_22
+	bgt .Lstrmm_kernel_L2_M1_22

-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L2_M1_100
+	ble .Lstrmm_kernel_L2_M1_100

-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:

 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L2_M1_42
+	bgt .Lstrmm_kernel_L2_M1_42

-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:

 	SAVE1x2

@@ -2428,7 +2428,7 @@ strmm_kernel_L2_M1_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
 #if !defined(LEFT)
 	add tempOffset, tempOffset, #2
 #endif
@@ -2437,11 +2437,11 @@ strmm_kernel_L2_END:

 /******************************************************************************/
 /******************************************************************************/

-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:

 	mov counterJ , origN
 	tst counterJ , #1
-	ble strmm_kernel_L999 // done
+	ble .Lstrmm_kernel_L999 // done

 	mov pCRow0, pC // pCRow0 = C

@@ -2454,14 +2454,14 @@ strmm_kernel_L1_BEGIN:

 /******************************************************************************/

-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #3
 	cmp counterI, #0
-	ble strmm_kernel_L1_M4_BEGIN
+	ble .Lstrmm_kernel_L1_M4_BEGIN

-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:

 	INIT8x1

@@ -2484,10 +2484,10 @@ strmm_kernel_L1_M8_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L1_M8_40
+	ble .Lstrmm_kernel_L1_M8_40

 	.align 5
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
 	KERNEL8x1_SUB
 	KERNEL8x1_SUB
 	KERNEL8x1_SUB
@@ -2499,22 +2499,22 @@ strmm_kernel_L1_M8_22:
 	KERNEL8x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M8_22
+	bgt .Lstrmm_kernel_L1_M8_22

-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L1_M8_100
+	ble .Lstrmm_kernel_L1_M8_100

-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:

 	KERNEL8x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M8_42
+	bgt .Lstrmm_kernel_L1_M8_42

-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:

 	SAVE8x1

@@ -2533,23 +2533,23 @@ strmm_kernel_L1_M8_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:

 	subs counterI, counterI, #1
-	bgt strmm_kernel_L1_M8_20
+	bgt .Lstrmm_kernel_L1_M8_20

 /******************************************************************************/

-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:

 	mov counterI, origM
 	tst counterI , #7
-	ble strmm_kernel_L1_END
+	ble .Lstrmm_kernel_L1_END

 	tst counterI, #4
-	ble strmm_kernel_L1_M2_BEGIN
+	ble .Lstrmm_kernel_L1_M2_BEGIN

-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:

 	INIT4x1

@@ -2572,10 +2572,10 @@ strmm_kernel_L1_M4_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L1_M4_40
+	ble .Lstrmm_kernel_L1_M4_40

 	.align 5
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
@@ -2587,22 +2587,22 @@ strmm_kernel_L1_M4_22:
 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M4_22
+	bgt .Lstrmm_kernel_L1_M4_22

-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L1_M4_100
+	ble .Lstrmm_kernel_L1_M4_100

-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:

 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M4_42
+	bgt .Lstrmm_kernel_L1_M4_42

-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:

 	SAVE4x1

@@ -2621,20 +2621,20 @@ strmm_kernel_L1_M4_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:

 /******************************************************************************/

-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble strmm_kernel_L1_END
+	ble .Lstrmm_kernel_L1_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble strmm_kernel_L1_M1_BEGIN
+	ble .Lstrmm_kernel_L1_M1_BEGIN

-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:

 	INIT2x1

@@ -2657,9 +2657,9 @@ strmm_kernel_L1_M2_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L1_M2_40
+	ble .Lstrmm_kernel_L1_M2_40

-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:

 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
@@ -2672,22 +2672,22 @@ strmm_kernel_L1_M2_22:
 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M2_22
+	bgt .Lstrmm_kernel_L1_M2_22

-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L1_M2_100
+	ble .Lstrmm_kernel_L1_M2_100

-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:

 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M2_42
+	bgt .Lstrmm_kernel_L1_M2_42

-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:

 	SAVE2x1

@@ -2706,16 +2706,16 @@ strmm_kernel_L1_M2_100:
 #if defined(LEFT)
 	add tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:

 /******************************************************************************/

-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble strmm_kernel_L1_END
+	ble .Lstrmm_kernel_L1_END

-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:

 	INIT1x1

@@ -2738,9 +2738,9 @@ strmm_kernel_L1_M1_20:
 #endif
 	asr counterL , tempK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble strmm_kernel_L1_M1_40
+	ble .Lstrmm_kernel_L1_M1_40

-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
@@ -2752,30 +2752,30 @@ strmm_kernel_L1_M1_22:
 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M1_22
+	bgt .Lstrmm_kernel_L1_M1_22

-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:

 	ands counterL , tempK, #7 // counterL = counterL % 8
-	ble strmm_kernel_L1_M1_100
+	ble .Lstrmm_kernel_L1_M1_100

-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:

 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt strmm_kernel_L1_M1_42
+	bgt .Lstrmm_kernel_L1_M1_42

-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:

 	SAVE1x1

-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:

 /******************************************************************************/

-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
 	mov x0, #0 // set return value
 	ldp d8, d9, [sp, #(0 * 16)]
 	ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/swap.S b/kernel/arm64/swap.S
index 37ed83f2a..184e02e9c 100644
--- a/kernel/arm64/swap.S
+++ b/kernel/arm64/swap.S
@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	PROLOGUE

 	cmp N, xzr
-	ble swap_kernel_L999
+	ble .Lswap_kernel_L999

 	cmp INC_X, #1
-	bne swap_kernel_S_BEGIN
+	bne .Lswap_kernel_S_BEGIN

 	cmp INC_Y, #1
-	bne swap_kernel_S_BEGIN
+	bne .Lswap_kernel_S_BEGIN

-swap_kernel_F_BEGIN:
+.Lswap_kernel_F_BEGIN:

 	asr I, N, #3
 	cmp I, xzr
-	beq swap_kernel_F1
+	beq .Lswap_kernel_F1

-swap_kernel_F8:
+.Lswap_kernel_F8:

 	KERNEL_F8

 	subs I, I, #1
-	bne swap_kernel_F8
+	bne .Lswap_kernel_F8

-swap_kernel_F1:
+.Lswap_kernel_F1:

 	ands I, N, #7
-	ble swap_kernel_L999
+	ble .Lswap_kernel_L999

-swap_kernel_F10:
+.Lswap_kernel_F10:

 	KERNEL_F1

 	subs I, I, #1
-	bne swap_kernel_F10
+	bne .Lswap_kernel_F10

-	b swap_kernel_L999
+	b .Lswap_kernel_L999

-swap_kernel_S_BEGIN:
+.Lswap_kernel_S_BEGIN:

 	INIT_S

 	asr I, N, #2
 	cmp I, xzr
-	ble swap_kernel_S1
+	ble .Lswap_kernel_S1

-swap_kernel_S4:
+.Lswap_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
 	KERNEL_S1

 	subs I, I, #1
-	bne swap_kernel_S4
+	bne .Lswap_kernel_S4

-swap_kernel_S1:
+.Lswap_kernel_S1:

 	ands I, N, #3
-	ble swap_kernel_L999
+	ble .Lswap_kernel_L999

-swap_kernel_S10:
+.Lswap_kernel_S10:

 	KERNEL_S1

 	subs I, I, #1
-	bne swap_kernel_S10
+	bne .Lswap_kernel_S10

-swap_kernel_L999:
+.Lswap_kernel_L999:

 	mov w0, wzr
 	ret
diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S
index 7db339f53..c2c0a5374 100644
--- a/kernel/arm64/zamax.S
+++ b/kernel/arm64/zamax.S
@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	PROLOGUE

 	cmp N, xzr
-	ble amax_kernel_zero
+	ble .Lzamax_kernel_zero
 	cmp INC_X, xzr
-	ble amax_kernel_zero
+	ble .Lzamax_kernel_zero

 	cmp INC_X, #1
-	bne amax_kernel_S_BEGIN
+	bne .Lzamax_kernel_S_BEGIN

-amax_kernel_F_BEGIN:
+.Lzamax_kernel_F_BEGIN:

 	asr I, N, #2
 	cmp I, xzr
-	beq amax_kernel_F1_INIT
+	beq .Lzamax_kernel_F1_INIT

 	INIT_F4
 	subs I, I, #1
-	beq amax_kernel_F1
+	beq .Lzamax_kernel_F1

-amax_kernel_F4:
+.Lzamax_kernel_F4:

 	KERNEL_F4

 	subs I, I, #1
-	bne amax_kernel_F4
+	bne .Lzamax_kernel_F4

-amax_kernel_F1:
+.Lzamax_kernel_F1:

 	ands I, N, #3
-	ble amax_kernel_L999
+	ble .Lzamax_kernel_L999

-amax_kernel_F10:
+.Lzamax_kernel_F10:

 	KERNEL_F1

 	subs I, I, #1
-	bne amax_kernel_F10
+	bne .Lzamax_kernel_F10

 	ret

-amax_kernel_F1_INIT:
+.Lzamax_kernel_F1_INIT:

 	INIT_F1
 	subs N, N, #1
-	b amax_kernel_F1
+	b .Lzamax_kernel_F1

-amax_kernel_S_BEGIN:
+.Lzamax_kernel_S_BEGIN:

 	INIT_S

 	subs N, N, #1
-	ble amax_kernel_L999
+	ble .Lzamax_kernel_L999

 	asr I, N, #2
 	cmp I, xzr
-	ble amax_kernel_S1
+	ble .Lzamax_kernel_S1

-amax_kernel_S4:
+.Lzamax_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
 	KERNEL_S1

 	subs I, I, #1
-	bne amax_kernel_S4
+	bne .Lzamax_kernel_S4

-amax_kernel_S1:
+.Lzamax_kernel_S1:

 	ands I, N, #3
-	ble amax_kernel_L999
+	ble .Lzamax_kernel_L999

-amax_kernel_S10:
+.Lzamax_kernel_S10:

 	KERNEL_S1

 	subs I, I, #1
-	bne amax_kernel_S10
+	bne .Lzamax_kernel_S10

-amax_kernel_L999:
+.Lzamax_kernel_L999:

 	ret

-amax_kernel_zero:
+.Lzamax_kernel_zero:

 	fmov MAXF, REG0
 	ret
diff --git a/kernel/arm64/zasum.S b/kernel/arm64/zasum.S
index bf586d367..0d5ec952b 100644
--- a/kernel/arm64/zasum.S
+++ b/kernel/arm64/zasum.S
@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmov SUMF, REG0

 	cmp N, xzr
-	ble asum_kernel_L999
+	ble .Lzasum_kernel_L999
 	cmp INC_X, xzr
-	ble asum_kernel_L999
+	ble .Lzasum_kernel_L999

 	cmp INC_X, #1
-	bne asum_kernel_S_BEGIN
+	bne .Lzasum_kernel_S_BEGIN

-asum_kernel_F_BEGIN:
+.Lzasum_kernel_F_BEGIN:

 	asr I, N, #2
 	cmp I, xzr
-	beq asum_kernel_F1
+	beq .Lzasum_kernel_F1

-asum_kernel_F4:
+.Lzasum_kernel_F4:

 	KERNEL_F4

 	subs I, I, #1
-	bne asum_kernel_F4
+	bne .Lzasum_kernel_F4

 	KERNEL_F4_FINALIZE

-asum_kernel_F1:
+.Lzasum_kernel_F1:

 	ands I, N, #3
-	ble asum_kernel_L999
+	ble .Lzasum_kernel_L999

-asum_kernel_F10:
+.Lzasum_kernel_F10:

 	KERNEL_F1

 	subs I, I, #1
-	bne asum_kernel_F10
+	bne .Lzasum_kernel_F10

-asum_kernel_L999:
+.Lzasum_kernel_L999:
 	ret

-asum_kernel_S_BEGIN:
+.Lzasum_kernel_S_BEGIN:

 	INIT_S

 	asr I, N, #2
 	cmp I, xzr
-	ble asum_kernel_S1
+	ble .Lzasum_kernel_S1

-asum_kernel_S4:
+.Lzasum_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
 	KERNEL_S1

 	subs I, I, #1
-	bne asum_kernel_S4
+	bne .Lzasum_kernel_S4

-asum_kernel_S1:
+.Lzasum_kernel_S1:

 	ands I, N, #3
-	ble asum_kernel_L999
+	ble .Lzasum_kernel_L999

-asum_kernel_S10:
+.Lzasum_kernel_S10:

 	KERNEL_S1

 	subs I, I, #1
-	bne asum_kernel_S10
+	bne .Lzasum_kernel_S10

 	ret
diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S
index 70c249981..46d7b0478 100644
--- a/kernel/arm64/zaxpy.S
+++ b/kernel/arm64/zaxpy.S
@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	PROLOGUE

 	cmp N, xzr
-	ble zaxpy_kernel_L999
+	ble .Lzaxpy_kernel_L999

 	mov Y_COPY, Y

 	fcmp DA_R, #0.0
 	bne .L1
 	fcmp DA_I, #0.0
-	beq zaxpy_kernel_L999
+	beq .Lzaxpy_kernel_L999

 .L1:
 	INIT

 	cmp INC_X, #1
-	bne zaxpy_kernel_S_BEGIN
+	bne .Lzaxpy_kernel_S_BEGIN
 	cmp INC_Y, #1
-	bne zaxpy_kernel_S_BEGIN
+	bne .Lzaxpy_kernel_S_BEGIN

-zaxpy_kernel_F_BEGIN:
+.Lzaxpy_kernel_F_BEGIN:

 	asr I, N, #2
 	cmp I, xzr
-	beq zaxpy_kernel_F1
+	beq .Lzaxpy_kernel_F1

 	KERNEL_INIT_F4

-zaxpy_kernel_F4:
+.Lzaxpy_kernel_F4:

 	KERNEL_F4

 	subs I, I, #1
-	bne zaxpy_kernel_F4
+	bne .Lzaxpy_kernel_F4

-zaxpy_kernel_F1:
+.Lzaxpy_kernel_F1:

 	ands I, N, #3
-	ble zaxpy_kernel_L999
+	ble .Lzaxpy_kernel_L999

-zaxpy_kernel_F10:
+.Lzaxpy_kernel_F10:

 	KERNEL_F1

 	subs I, I, #1
-	bne zaxpy_kernel_F10
+	bne .Lzaxpy_kernel_F10

 	mov w0, wzr
 	ret

-zaxpy_kernel_S_BEGIN:
+.Lzaxpy_kernel_S_BEGIN:

 	INIT_S

 	asr I, N, #2
 	cmp I, xzr
-	ble zaxpy_kernel_S1
+	ble .Lzaxpy_kernel_S1

-zaxpy_kernel_S4:
+.Lzaxpy_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
 	KERNEL_S1

 	subs I, I, #1
-	bne zaxpy_kernel_S4
+	bne .Lzaxpy_kernel_S4

-zaxpy_kernel_S1:
+.Lzaxpy_kernel_S1:

 	ands I, N, #3
-	ble zaxpy_kernel_L999
+	ble .Lzaxpy_kernel_L999

-zaxpy_kernel_S10:
+.Lzaxpy_kernel_S10:

 	KERNEL_S1

 	subs I, I, #1
-	bne zaxpy_kernel_S10
+	bne .Lzaxpy_kernel_S10

-zaxpy_kernel_L999:
+.Lzaxpy_kernel_L999:

 	mov w0, wzr
 	ret
diff --git a/kernel/arm64/zdot.S b/kernel/arm64/zdot.S
index 3e8e3d7d9..044ace3b8 100644
--- a/kernel/arm64/zdot.S
+++ b/kernel/arm64/zdot.S
@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 	cmp N, xzr
-	ble dot_kernel_L999
+	ble .Lzdot_kernel_L999

 	cmp INC_X, #1
-	bne dot_kernel_S_BEGIN
+	bne .Lzdot_kernel_S_BEGIN
 	cmp INC_Y, #1
-	bne dot_kernel_S_BEGIN
+	bne .Lzdot_kernel_S_BEGIN

-dot_kernel_F_BEGIN:
+.Lzdot_kernel_F_BEGIN:

 	asr I, N, #2
 	cmp I, xzr
-	beq dot_kernel_F1
+	beq .Lzdot_kernel_F1

-dot_kernel_F4:
+.Lzdot_kernel_F4:

 	KERNEL_F4

 	subs I, I, #1
-	bne dot_kernel_F4
+	bne .Lzdot_kernel_F4

 	KERNEL_F4_FINALIZE

-dot_kernel_F1:
+.Lzdot_kernel_F1:

 	ands I, N, #3
-	ble dot_kernel_L999
+	ble .Lzdot_kernel_L999

-dot_kernel_F10:
+.Lzdot_kernel_F10:

 	KERNEL_F1

 	subs I, I, #1
-	bne dot_kernel_F10
+	bne .Lzdot_kernel_F10

 	ret

-dot_kernel_S_BEGIN:
+.Lzdot_kernel_S_BEGIN:

 	INIT_S

 	asr I, N, #2
 	cmp I, xzr
-	ble dot_kernel_S1
+	ble .Lzdot_kernel_S1

-dot_kernel_S4:
+.Lzdot_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
 	KERNEL_S1

 	subs I, I, #1
-	bne dot_kernel_S4
+	bne .Lzdot_kernel_S4

-dot_kernel_S1:
+.Lzdot_kernel_S1:

 	ands I, N, #3
-	ble dot_kernel_L999
+	ble .Lzdot_kernel_L999

-dot_kernel_S10:
+.Lzdot_kernel_S10:

 	KERNEL_S1

 	subs I, I, #1
-	bne dot_kernel_S10
+	bne .Lzdot_kernel_S10

-dot_kernel_L999:
+.Lzdot_kernel_L999:

 	ret
diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S
index 08a1531cf..f8e877f3c 100644
--- a/kernel/arm64/zgemm_kernel_4x4.S
+++ b/kernel/arm64/zgemm_kernel_4x4.S
@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	mov counterJ, origN
 	asr counterJ, counterJ, #2 // J = J / 4
 	cmp counterJ, #0
-	ble zgemm_kernel_L2_BEGIN
+	ble .Lzgemm_kernel_L2_BEGIN

-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
 	mov pCRow0, pC
 	add pCRow1, pCRow0, LDC
 	add pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:

 	mov pA, origPA // pA = start of A array

-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI, #0
-	ble zgemm_kernel_L4_M2_BEGIN
+	ble .Lzgemm_kernel_L4_M2_BEGIN

 	.align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:

 	mov pB, origPB
 	asr counterL , origK, #3
 	cmp counterL , #2
-	blt zgemm_kernel_L4_M4_32
+	blt .Lzgemm_kernel_L4_M4_32

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
 	KERNEL4x4_M2

 	subs counterL, counterL, #2 // subtract 2
-	ble zgemm_kernel_L4_M4_22a
+	ble .Lzgemm_kernel_L4_M4_22a

 	.align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
 	KERNEL4x4_M2

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M4_22
+	bgt .Lzgemm_kernel_L4_M4_22

 	.align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b zgemm_kernel_L4_M4_44
+	b .Lzgemm_kernel_L4_M4_44

 	.align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:

 	tst counterL, #1
-	ble zgemm_kernel_L4_M4_40
+	ble .Lzgemm_kernel_L4_M4_40

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b zgemm_kernel_L4_M4_44
+	b .Lzgemm_kernel_L4_M4_44

-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:

 	INIT4x4

-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:

 	ands counterL , origK, #7
-	ble zgemm_kernel_L4_M4_100
+	ble .Lzgemm_kernel_L4_M4_100

 	.align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:

 	KERNEL4x4_SUB

 	subs counterL, counterL, #1
-	bne zgemm_kernel_L4_M4_46
+	bne .Lzgemm_kernel_L4_M4_46

-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
 	prfm PLDL1KEEP, [pA]
 	prfm PLDL1KEEP, [pA, #64]
 	prfm PLDL1KEEP, [origPB]

 	SAVE4x4

-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
 	subs counterI, counterI, #1
-	bne zgemm_kernel_L4_M4_20
+	bne .Lzgemm_kernel_L4_M4_20

-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L4_END
+	ble .Lzgemm_kernel_L4_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L4_M1_BEGIN
+	ble .Lzgemm_kernel_L4_M1_BEGIN

-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:

 	INIT2x4

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L4_M2_40
+	ble .Lzgemm_kernel_L4_M2_40

-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:

 	KERNEL2x4_SUB
 	KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M2_22
+	bgt .Lzgemm_kernel_L4_M2_22

-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L4_M2_100
+	ble .Lzgemm_kernel_L4_M2_100

-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:

 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M2_42
+	bgt .Lzgemm_kernel_L4_M2_42

-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:

 	SAVE2x4

-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:

-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L4_END
+	ble .Lzgemm_kernel_L4_END

-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:

 	INIT1x4

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L4_M1_40
+	ble .Lzgemm_kernel_L4_M1_40

-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M1_22
+	bgt .Lzgemm_kernel_L4_M1_22

-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L4_M1_100
+	ble .Lzgemm_kernel_L4_M1_100

-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:

 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M1_42
+	bgt .Lzgemm_kernel_L4_M1_42

-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:

 	SAVE1x4

-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:

 	lsl temp, origK, #6
 	add origPB, origPB, temp // B = B + K * 4 * 8 * 2

 	subs counterJ, counterJ , #1 // j--
-	bgt zgemm_kernel_L4_BEGIN
+	bgt .Lzgemm_kernel_L4_BEGIN

 /******************************************************************************/

-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

 	mov counterJ , origN
 	tst counterJ , #3
-	ble zgemm_kernel_L999
+	ble .Lzgemm_kernel_L999

 	tst counterJ , #2
-	ble zgemm_kernel_L1_BEGIN
+	ble .Lzgemm_kernel_L1_BEGIN

 	mov pCRow0, pC // pCRow0 = pC

@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction

-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI,#0
-	ble zgemm_kernel_L2_M2_BEGIN
+	ble .Lzgemm_kernel_L2_M2_BEGIN

-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:

 	INIT4x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble zgemm_kernel_L2_M4_40
+	ble .Lzgemm_kernel_L2_M4_40

 	.align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M4_22
+	bgt .Lzgemm_kernel_L2_M4_22

-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M4_100
+	ble .Lzgemm_kernel_L2_M4_100

-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:

 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M4_42
+	bgt .Lzgemm_kernel_L2_M4_42

-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:

 	SAVE4x2

-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:

 	subs counterI, counterI, #1
-	bgt zgemm_kernel_L2_M4_20
+	bgt .Lzgemm_kernel_L2_M4_20

-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L2_END
+	ble .Lzgemm_kernel_L2_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L2_M1_BEGIN
+	ble .Lzgemm_kernel_L2_M1_BEGIN

-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:

 	INIT2x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble zgemm_kernel_L2_M2_40
+	ble .Lzgemm_kernel_L2_M2_40

-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:

 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
 	KERNEL2x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M2_22
+	bgt .Lzgemm_kernel_L2_M2_22

-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M2_100
+	ble .Lzgemm_kernel_L2_M2_100

-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:

 	KERNEL2x2_SUB
 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M2_42
+	bgt .Lzgemm_kernel_L2_M2_42

-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:

 	SAVE2x2

-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:

-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L2_END
+	ble .Lzgemm_kernel_L2_END

-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:

 	INIT1x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL, #0
-	ble zgemm_kernel_L2_M1_40
+	ble .Lzgemm_kernel_L2_M1_40

-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M1_22
+	bgt .Lzgemm_kernel_L2_M1_22

-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M1_100
+	ble .Lzgemm_kernel_L2_M1_100

-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:

 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M1_42
+	bgt .Lzgemm_kernel_L2_M1_42

-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:

 	SAVE1x2

-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:

 	lsl temp, origK, #5
 	add origPB, origPB, temp // B = B + K * 2 * 8 * 2

 /******************************************************************************/

-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:

 	mov counterJ , origN
 	tst counterJ , #1
-	ble zgemm_kernel_L999 // done
+	ble .Lzgemm_kernel_L999 // done

 	mov pCRow0, pC // pCRow0 = C

@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:

-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI, #0
-	ble zgemm_kernel_L1_M2_BEGIN
+	ble .Lzgemm_kernel_L1_M2_BEGIN

-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:

 	INIT4x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M4_40
+	ble .Lzgemm_kernel_L1_M4_40

 	.align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M4_22
+	bgt .Lzgemm_kernel_L1_M4_22

-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M4_100
+	ble .Lzgemm_kernel_L1_M4_100

-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:

 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M4_42
+	bgt .Lzgemm_kernel_L1_M4_42

-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:

 	SAVE4x1

-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:

 	subs counterI, counterI, #1
-	bgt zgemm_kernel_L1_M4_20
+	bgt .Lzgemm_kernel_L1_M4_20

-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L1_END
+	ble .Lzgemm_kernel_L1_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L1_M1_BEGIN
+	ble .Lzgemm_kernel_L1_M1_BEGIN

-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:

 	INIT2x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M2_40
+	ble .Lzgemm_kernel_L1_M2_40

-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:

 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M2_22
+	bgt .Lzgemm_kernel_L1_M2_22

-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M2_100
+	ble .Lzgemm_kernel_L1_M2_100

-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:

 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M2_42
+	bgt .Lzgemm_kernel_L1_M2_42

-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:

 	SAVE2x1

-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:

-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L1_END
+	ble .Lzgemm_kernel_L1_END

-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:

 	INIT1x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M1_40
+	ble .Lzgemm_kernel_L1_M1_40

-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M1_22
+	bgt .Lzgemm_kernel_L1_M1_22

-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M1_100
+	ble .Lzgemm_kernel_L1_M1_100

-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:

 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M1_42
+	bgt .Lzgemm_kernel_L1_M1_42

-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:

 	SAVE1x1

-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:

-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
 	mov x0, #0 // set return value
 	ldp d8, d9, [sp, #(0 * 16)]
 	ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
index e5b4cba9c..8e6ff655d 100644
--- a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
+++ b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	mov counterJ, origN
 	asr counterJ, counterJ, #2 // J = J / 4
 	cmp counterJ, #0
-	ble zgemm_kernel_L2_BEGIN
+	ble .Lzgemm_kernel_L2_BEGIN

-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
 	mov pCRow0, pC
 	add pCRow1, pCRow0, LDC
 	add pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:

 	mov pA, origPA // pA = start of A array

-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI, #0
-	ble zgemm_kernel_L4_M2_BEGIN
+	ble .Lzgemm_kernel_L4_M2_BEGIN

 	.align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:

 	mov pB, origPB
 	asr counterL , origK, #3
 	cmp counterL , #2
-	blt zgemm_kernel_L4_M4_32
+	blt .Lzgemm_kernel_L4_M4_32

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
 	KERNEL4x4_M2

 	subs counterL, counterL, #2 // subtract 2
-	ble zgemm_kernel_L4_M4_22a
+	ble .Lzgemm_kernel_L4_M4_22a

 	.align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
 	KERNEL4x4_M2

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M4_22
+	bgt .Lzgemm_kernel_L4_M4_22

 	.align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b zgemm_kernel_L4_M4_44
+	b .Lzgemm_kernel_L4_M4_44

 	.align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:

 	tst counterL, #1
-	ble zgemm_kernel_L4_M4_40
+	ble .Lzgemm_kernel_L4_M4_40

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b zgemm_kernel_L4_M4_44
+	b .Lzgemm_kernel_L4_M4_44

-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:

 	INIT4x4

-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:

 	ands counterL , origK, #7
-	ble zgemm_kernel_L4_M4_100
+	ble .Lzgemm_kernel_L4_M4_100

 	.align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:

 	KERNEL4x4_SUB

 	subs counterL, counterL, #1
-	bne zgemm_kernel_L4_M4_46
+	bne .Lzgemm_kernel_L4_M4_46

-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
 	prfm PLDL1KEEP, [pA]
 	prfm PLDL1KEEP, [pA, #64]
 	prfm PLDL1KEEP, [origPB]

 	SAVE4x4

-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
 	subs counterI, counterI, #1
-	bne zgemm_kernel_L4_M4_20
+	bne .Lzgemm_kernel_L4_M4_20

-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L4_END
+	ble .Lzgemm_kernel_L4_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L4_M1_BEGIN
+	ble .Lzgemm_kernel_L4_M1_BEGIN

-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:

 	INIT2x4

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L4_M2_40
+	ble .Lzgemm_kernel_L4_M2_40

-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:

 	KERNEL2x4_SUB
 	KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M2_22
+	bgt .Lzgemm_kernel_L4_M2_22

-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L4_M2_100
+	ble .Lzgemm_kernel_L4_M2_100

-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:

 	KERNEL2x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M2_42
+	bgt .Lzgemm_kernel_L4_M2_42

-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:

 	SAVE2x4

-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:

-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L4_END
+	ble .Lzgemm_kernel_L4_END

-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:

 	INIT1x4

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L4_M1_40
+	ble .Lzgemm_kernel_L4_M1_40

-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M1_22
+	bgt .Lzgemm_kernel_L4_M1_22

-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L4_M1_100
+	ble .Lzgemm_kernel_L4_M1_100

-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:

 	KERNEL1x4_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L4_M1_42
+	bgt .Lzgemm_kernel_L4_M1_42

-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:

 	SAVE1x4

-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:

 	lsl temp, origK, #6
 	add origPB, origPB, temp // B = B + K * 4 * 8 * 2

 	subs counterJ, counterJ , #1 // j--
-	bgt zgemm_kernel_L4_BEGIN
+	bgt .Lzgemm_kernel_L4_BEGIN

 /******************************************************************************/

-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

 	mov counterJ , origN
 	tst counterJ , #3
-	ble zgemm_kernel_L999
+	ble .Lzgemm_kernel_L999

 	tst counterJ , #2
-	ble zgemm_kernel_L1_BEGIN
+	ble .Lzgemm_kernel_L1_BEGIN

 	mov pCRow0, pC // pCRow0 = pC

@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction

-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI,#0
-	ble zgemm_kernel_L2_M2_BEGIN
+	ble .Lzgemm_kernel_L2_M2_BEGIN

-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:

 	INIT4x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble zgemm_kernel_L2_M4_40
+	ble .Lzgemm_kernel_L2_M4_40

 	.align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M4_22
+	bgt .Lzgemm_kernel_L2_M4_22

-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M4_100
+	ble .Lzgemm_kernel_L2_M4_100

-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:

 	KERNEL4x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M4_42
+	bgt .Lzgemm_kernel_L2_M4_42

-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:

 	SAVE4x2

-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:

 	subs counterI, counterI, #1
-	bgt zgemm_kernel_L2_M4_20
+	bgt .Lzgemm_kernel_L2_M4_20

-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L2_END
+	ble .Lzgemm_kernel_L2_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L2_M1_BEGIN
+	ble .Lzgemm_kernel_L2_M1_BEGIN

-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:

 	INIT2x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL,#0
-	ble zgemm_kernel_L2_M2_40
+	ble .Lzgemm_kernel_L2_M2_40

-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:

 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
 	KERNEL2x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M2_22
+	bgt .Lzgemm_kernel_L2_M2_22

-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M2_100
+	ble .Lzgemm_kernel_L2_M2_100

-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:

 	KERNEL2x2_SUB
 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M2_42
+	bgt .Lzgemm_kernel_L2_M2_42

-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:

 	SAVE2x2

-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:

-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L2_END
+	ble .Lzgemm_kernel_L2_END

-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:

 	INIT1x2

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL, #0
-	ble zgemm_kernel_L2_M1_40
+	ble .Lzgemm_kernel_L2_M1_40

-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M1_22
+	bgt .Lzgemm_kernel_L2_M1_22

-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L2_M1_100
+	ble .Lzgemm_kernel_L2_M1_100

-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:

 	KERNEL1x2_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L2_M1_42
+	bgt .Lzgemm_kernel_L2_M1_42

-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:

 	SAVE1x2

-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:

 	lsl temp, origK, #5
 	add origPB, origPB, temp // B = B + K * 2 * 8 * 2

 /******************************************************************************/

-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:

 	mov counterJ , origN
 	tst counterJ , #1
-	ble zgemm_kernel_L999 // done
+	ble .Lzgemm_kernel_L999 // done

 	mov pCRow0, pC // pCRow0 = C

@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:

-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:

 	mov counterI, origM
 	asr counterI, counterI, #2 // counterI = counterI / 4
 	cmp counterI, #0
-	ble zgemm_kernel_L1_M2_BEGIN
+	ble .Lzgemm_kernel_L1_M2_BEGIN

-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:

 	INIT4x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M4_40
+	ble .Lzgemm_kernel_L1_M4_40

 	.align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M4_22
+	bgt .Lzgemm_kernel_L1_M4_22

-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M4_100
+	ble .Lzgemm_kernel_L1_M4_100

-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:

 	KERNEL4x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M4_42
+	bgt .Lzgemm_kernel_L1_M4_42

-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:

 	SAVE4x1

-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:

 	subs counterI, counterI, #1
-	bgt zgemm_kernel_L1_M4_20
+	bgt .Lzgemm_kernel_L1_M4_20

-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:

 	mov counterI, origM
 	tst counterI , #3
-	ble zgemm_kernel_L1_END
+	ble .Lzgemm_kernel_L1_END

 	tst counterI, #2 // counterI = counterI / 2
-	ble zgemm_kernel_L1_M1_BEGIN
+	ble .Lzgemm_kernel_L1_M1_BEGIN

-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:

 	INIT2x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M2_40
+	ble .Lzgemm_kernel_L1_M2_40

-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:

 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M2_22
+	bgt .Lzgemm_kernel_L1_M2_22

-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M2_100
+	ble .Lzgemm_kernel_L1_M2_100

-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:

 	KERNEL2x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M2_42
+	bgt .Lzgemm_kernel_L1_M2_42

-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:

 	SAVE2x1

-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:

-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:

 	tst counterI, #1 // counterI = counterI % 2
-	ble zgemm_kernel_L1_END
+	ble .Lzgemm_kernel_L1_END

-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:

 	INIT1x1

 	mov pB, origPB
 	asr counterL , origK, #3 // counterL = counterL / 8
 	cmp counterL , #0
-	ble zgemm_kernel_L1_M1_40
+	ble .Lzgemm_kernel_L1_M1_40

-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M1_22
+	bgt .Lzgemm_kernel_L1_M1_22

-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:

 	ands counterL , origK, #7 // counterL = counterL % 8
-	ble zgemm_kernel_L1_M1_100
+	ble .Lzgemm_kernel_L1_M1_100

-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:

 	KERNEL1x1_SUB

 	subs counterL, counterL, #1
-	bgt zgemm_kernel_L1_M1_42
+	bgt .Lzgemm_kernel_L1_M1_42

-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:

 	SAVE1x1

-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:

-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
 	mov x0, #0 // set return value
 	ldp d8, d9, [sp, #(0 * 16)]
 	ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S
index a28d1b0ce..28afcada5 100644
--- a/kernel/arm64/zgemv_n.S
+++ b/kernel/arm64/zgemv_n.S
@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	SAVE_REGS

 	cmp N, xzr
-	ble zgemv_n_kernel_L999
+	ble .Lzgemv_n_kernel_L999
 	cmp M, xzr
-	ble zgemv_n_kernel_L999
+	ble .Lzgemv_n_kernel_L999

 	lsl LDA, LDA, #SHZ
 	lsl INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT cmp INC_Y, #1 - bne zgemv_n_kernel_S_BEGIN + bne .Lzgemv_n_kernel_S_BEGIN -zgemv_n_kernel_F_LOOP: +.Lzgemv_n_kernel_F_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y @@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP: asr I, M, #2 cmp I, xzr - beq zgemv_n_kernel_F1 + beq .Lzgemv_n_kernel_F1 -zgemv_n_kernel_F4: +.Lzgemv_n_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zgemv_n_kernel_F4 + bne .Lzgemv_n_kernel_F4 -zgemv_n_kernel_F1: +.Lzgemv_n_kernel_F1: ands I, M, #3 - ble zgemv_n_kernel_F_END + ble .Lzgemv_n_kernel_F_END -zgemv_n_kernel_F10: +.Lzgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zgemv_n_kernel_F10 + bne .Lzgemv_n_kernel_F10 -zgemv_n_kernel_F_END: +.Lzgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 - bne zgemv_n_kernel_F_LOOP + bne .Lzgemv_n_kernel_F_LOOP - b zgemv_n_kernel_L999 + b .Lzgemv_n_kernel_L999 -zgemv_n_kernel_S_BEGIN: +.Lzgemv_n_kernel_S_BEGIN: INIT_S -zgemv_n_kernel_S_LOOP: +.Lzgemv_n_kernel_S_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y @@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble zgemv_n_kernel_S1 + ble .Lzgemv_n_kernel_S1 -zgemv_n_kernel_S4: +.Lzgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -440,27 +440,27 @@ zgemv_n_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zgemv_n_kernel_S4 + bne .Lzgemv_n_kernel_S4 -zgemv_n_kernel_S1: +.Lzgemv_n_kernel_S1: ands I, M, #3 - ble zgemv_n_kernel_S_END + ble .Lzgemv_n_kernel_S_END -zgemv_n_kernel_S10: +.Lzgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zgemv_n_kernel_S10 + bne .Lzgemv_n_kernel_S10 -zgemv_n_kernel_S_END: +.Lzgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 - bne zgemv_n_kernel_S_LOOP + bne .Lzgemv_n_kernel_S_LOOP -zgemv_n_kernel_L999: +.Lzgemv_n_kernel_L999: RESTORE_REGS mov w0, wzr diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index 79ce9bcf2..0151029c7 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble zgemv_t_kernel_L999 + ble .Lzgemv_t_kernel_L999 cmp M, xzr - ble zgemv_t_kernel_L999 + ble .Lzgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ @@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
INIT cmp INC_X, #1 - bne zgemv_t_kernel_S_BEGIN + bne .Lzgemv_t_kernel_S_BEGIN -zgemv_t_kernel_F_LOOP: +.Lzgemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X @@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP: asr I, M, #2 cmp I, xzr - beq zgemv_t_kernel_F1 + beq .Lzgemv_t_kernel_F1 -zgemv_t_kernel_F4: +.Lzgemv_t_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zgemv_t_kernel_F4 + bne .Lzgemv_t_kernel_F4 KERNEL_F4_FINALIZE -zgemv_t_kernel_F1: +.Lzgemv_t_kernel_F1: ands I, M, #3 - ble zgemv_t_kernel_F_END + ble .Lzgemv_t_kernel_F_END -zgemv_t_kernel_F10: +.Lzgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zgemv_t_kernel_F10 + bne .Lzgemv_t_kernel_F10 -zgemv_t_kernel_F_END: +.Lzgemv_t_kernel_F_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] @@ -355,15 +355,15 @@ zgemv_t_kernel_F_END: add A, A, LDA subs J, J, #1 - bne zgemv_t_kernel_F_LOOP + bne .Lzgemv_t_kernel_F_LOOP - b zgemv_t_kernel_L999 + b .Lzgemv_t_kernel_L999 -zgemv_t_kernel_S_BEGIN: +.Lzgemv_t_kernel_S_BEGIN: INIT_S -zgemv_t_kernel_S_LOOP: +.Lzgemv_t_kernel_S_LOOP: mov A_PTR, A mov X_PTR, X @@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble zgemv_t_kernel_S1 + ble .Lzgemv_t_kernel_S1 -zgemv_t_kernel_S4: +.Lzgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -381,21 +381,21 @@ zgemv_t_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zgemv_t_kernel_S4 + bne .Lzgemv_t_kernel_S4 -zgemv_t_kernel_S1: +.Lzgemv_t_kernel_S1: ands I, M, #3 - ble zgemv_t_kernel_S_END + ble .Lzgemv_t_kernel_S_END -zgemv_t_kernel_S10: +.Lzgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zgemv_t_kernel_S10 + bne .Lzgemv_t_kernel_S10 -zgemv_t_kernel_S_END: +.Lzgemv_t_kernel_S_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] @@ -413,9 +413,9 @@ zgemv_t_kernel_S_END: add A, A, LDA subs J, J, #1 - bne zgemv_t_kernel_S_LOOP + bne .Lzgemv_t_kernel_S_LOOP -zgemv_t_kernel_L999: +.Lzgemv_t_kernel_L999: RESTORE_REGS mov w0, wzr ret diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index 1360dc993..1c89685ea 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -226,43 +226,43 @@ KERNEL_S1_END_\@: INIT cmp N, #0 - ble nrm2_kernel_L999 + ble .Lznrm2_kernel_L999 cmp INC_X, #0 - beq nrm2_kernel_L999 + beq .Lznrm2_kernel_L999 cmp INC_X, #1 - bne nrm2_kernel_S_BEGIN + bne .Lznrm2_kernel_S_BEGIN -nrm2_kernel_F_BEGIN: +.Lznrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr - ble nrm2_kernel_F1 + ble .Lznrm2_kernel_F1 -nrm2_kernel_F8: +.Lznrm2_kernel_F8: KERNEL_F8 subs I, I, #1 - bne nrm2_kernel_F8 + bne .Lznrm2_kernel_F8 -nrm2_kernel_F1: +.Lznrm2_kernel_F1: ands I, N, #7 - ble nrm2_kernel_L999 + ble .Lznrm2_kernel_L999 -nrm2_kernel_F10: +.Lznrm2_kernel_F10: KERNEL_F1 subs I, I, #1 - bne nrm2_kernel_F10 + bne .Lznrm2_kernel_F10 - b nrm2_kernel_L999 + b .Lznrm2_kernel_L999 -nrm2_kernel_S_BEGIN: +.Lznrm2_kernel_S_BEGIN: INIT_S @@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN: .align 5 -nrm2_kernel_S10: +.Lznrm2_kernel_S10: KERNEL_S1 subs I, I, #1 - bne nrm2_kernel_S10 + bne .Lznrm2_kernel_S10 -nrm2_kernel_L999: +.Lznrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ diff --git a/kernel/arm64/zrot.S b/kernel/arm64/zrot.S index 90f138a19..b5e510ebe 100644 --- a/kernel/arm64/zrot.S +++ b/kernel/arm64/zrot.S @@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 INIT cmp INC_X, #1 - bne rot_kernel_S_BEGIN + bne .Lzrot_kernel_S_BEGIN cmp INC_Y, #1 - bne rot_kernel_S_BEGIN + bne .Lzrot_kernel_S_BEGIN -rot_kernel_F_BEGIN: +.Lzrot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq rot_kernel_F1 + beq .Lzrot_kernel_F1 KERNEL_INIT_F4 -rot_kernel_F4: +.Lzrot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne rot_kernel_F4 + bne .Lzrot_kernel_F4 -rot_kernel_F1: +.Lzrot_kernel_F1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 -rot_kernel_F10: +.Lzrot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne rot_kernel_F10 + bne .Lzrot_kernel_F10 mov w0, wzr ret -rot_kernel_S_BEGIN: +.Lzrot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble rot_kernel_S1 + ble .Lzrot_kernel_S1 -rot_kernel_S4: +.Lzrot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -236,21 +236,21 @@ rot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S4 + bne .Lzrot_kernel_S4 -rot_kernel_S1: +.Lzrot_kernel_S1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 -rot_kernel_S10: +.Lzrot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S10 + bne .Lzrot_kernel_S10 -rot_kernel_L999: +.Lzrot_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index daaa55e9d..929455975 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -215,71 +215,71 @@ zscal_begin: mov X_COPY, X cmp N, xzr - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 fcmp DA_R, #0.0 - bne zscal_kernel_R_non_zero + bne .Lzscal_kernel_R_non_zero fcmp DA_I, #0.0 - beq zscal_kernel_RI_zero + beq .Lzscal_kernel_RI_zero - b zscal_kernel_R_zero + b .Lzscal_kernel_R_zero -zscal_kernel_R_non_zero: +.Lzscal_kernel_R_non_zero: fcmp DA_I, #0.0 - beq zscal_kernel_I_zero + beq .Lzscal_kernel_I_zero /******************************************************************************* * A_R != 0 && A_I != 0 *******************************************************************************/ -zscal_kernel_RI_non_zero: +.Lzscal_kernel_RI_non_zero: INIT cmp INC_X, #1 - bne zscal_kernel_S_BEGIN + bne .Lzscal_kernel_S_BEGIN -zscal_kernel_F_BEGIN: +.Lzscal_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq zscal_kernel_F1 + beq .Lzscal_kernel_F1 KERNEL_INIT_F4 -zscal_kernel_F4: +.Lzscal_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zscal_kernel_F4 + bne .Lzscal_kernel_F4 -zscal_kernel_F1: +.Lzscal_kernel_F1: ands I, N, #3 - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 -zscal_kernel_F10: +.Lzscal_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zscal_kernel_F10 + bne .Lzscal_kernel_F10 mov w0, wzr ret -zscal_kernel_S_BEGIN: +.Lzscal_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble zscal_kernel_S1 + ble .Lzscal_kernel_S1 -zscal_kernel_S4: +.Lzscal_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -287,21 +287,21 @@ zscal_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zscal_kernel_S4 + bne .Lzscal_kernel_S4 -zscal_kernel_S1: +.Lzscal_kernel_S1: ands I, N, #3 - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 -zscal_kernel_S10: +.Lzscal_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zscal_kernel_S10 + bne .Lzscal_kernel_S10 -zscal_kernel_L999: +.Lzscal_kernel_L999: mov w0, wzr ret @@ -310,7 +310,7 @@ zscal_kernel_L999: * A_R == 0 && A_I != 0 *******************************************************************************/ -zscal_kernel_R_zero: +.Lzscal_kernel_R_zero: INIT_S #if !defined(DOUBLE) @@ -323,7 +323,7 @@ zscal_kernel_R_zero: ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I #endif -zscal_kernel_R_zero_1: +.Lzscal_kernel_R_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v1.2s // 
-DA_I*X1, DA_I*X0 @@ -337,7 +337,7 @@ zscal_kernel_R_zero_1: #endif add X, X, INC_X subs N, N, #1 - bne zscal_kernel_R_zero_1 + bne .Lzscal_kernel_R_zero_1 mov w0, wzr ret @@ -346,7 +346,7 @@ zscal_kernel_R_zero_1: * A_R != 0 && A_I == 0 *******************************************************************************/ -zscal_kernel_I_zero: +.Lzscal_kernel_I_zero: INIT_S #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R @@ -354,7 +354,7 @@ zscal_kernel_I_zero: ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R #endif -zscal_kernel_I_zero_1: +.Lzscal_kernel_I_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 @@ -366,7 +366,7 @@ zscal_kernel_I_zero_1: #endif add X, X, INC_X subs N, N, #1 - bne zscal_kernel_I_zero_1 + bne .Lzscal_kernel_I_zero_1 mov w0, wzr ret @@ -375,16 +375,16 @@ zscal_kernel_I_zero_1: * A_R == 0 && A_I == 0 *******************************************************************************/ -zscal_kernel_RI_zero: +.Lzscal_kernel_RI_zero: INIT_S -zscal_kernel_RI_zero_1: +.Lzscal_kernel_RI_zero_1: stp DA_R, DA_I, [X] add X, X, INC_X subs N, N, #1 - bne zscal_kernel_RI_zero_1 + bne .Lzscal_kernel_RI_zero_1 mov w0, wzr ret diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 77a7857ff..462acfe2b 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ztrmm_kernel_L2_BEGIN + ble .Lztrmm_kernel_L2_BEGIN -ztrmm_kernel_L4_BEGIN: +.Lztrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ztrmm_kernel_L4_M4_BEGIN: +.Lztrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ztrmm_kernel_L4_M2_BEGIN + ble .Lztrmm_kernel_L4_M2_BEGIN .align 5 -ztrmm_kernel_L4_M4_20: +.Lztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20: asr counterL , tempK, #3 cmp counterL , #2 - blt ztrmm_kernel_L4_M4_32 + blt .Lztrmm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 @@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20: KERNEL4x4_M2 subs counterL, counterL, #2 - ble ztrmm_kernel_L4_M4_22a + ble .Lztrmm_kernel_L4_M4_22a .align 5 -ztrmm_kernel_L4_M4_22: +.Lztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22: KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M4_22 + bgt .Lztrmm_kernel_L4_M4_22 .align 5 -ztrmm_kernel_L4_M4_22a: +.Lztrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ztrmm_kernel_L4_M4_44 + b .Lztrmm_kernel_L4_M4_44 .align 5 -ztrmm_kernel_L4_M4_32: +.Lztrmm_kernel_L4_M4_32: tst counterL, #1 - ble ztrmm_kernel_L4_M4_40 + ble .Lztrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b ztrmm_kernel_L4_M4_44 + b .Lztrmm_kernel_L4_M4_44 -ztrmm_kernel_L4_M4_40: +.Lztrmm_kernel_L4_M4_40: INIT4x4 -ztrmm_kernel_L4_M4_44: +.Lztrmm_kernel_L4_M4_44: ands counterL , tempK, #7 - ble ztrmm_kernel_L4_M4_100 + ble .Lztrmm_kernel_L4_M4_100 .align 5 -ztrmm_kernel_L4_M4_46: +.Lztrmm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 - bne 
ztrmm_kernel_L4_M4_46 + bne .Lztrmm_kernel_L4_M4_46 -ztrmm_kernel_L4_M4_100: +.Lztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -ztrmm_kernel_L4_M4_END: +.Lztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne ztrmm_kernel_L4_M4_20 + bne .Lztrmm_kernel_L4_M4_20 -ztrmm_kernel_L4_M2_BEGIN: +.Lztrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L4_END + ble .Lztrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L4_M1_BEGIN + ble .Lztrmm_kernel_L4_M1_BEGIN -ztrmm_kernel_L4_M2_20: +.Lztrmm_kernel_L4_M2_20: INIT2x4 @@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L4_M2_40 + ble .Lztrmm_kernel_L4_M2_40 -ztrmm_kernel_L4_M2_22: +.Lztrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M2_22 + bgt .Lztrmm_kernel_L4_M2_22 -ztrmm_kernel_L4_M2_40: +.Lztrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L4_M2_100 + ble .Lztrmm_kernel_L4_M2_100 -ztrmm_kernel_L4_M2_42: +.Lztrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M2_42 + bgt .Lztrmm_kernel_L4_M2_42 -ztrmm_kernel_L4_M2_100: +.Lztrmm_kernel_L4_M2_100: SAVE2x4 @@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L4_M2_END: +.Lztrmm_kernel_L4_M2_END: -ztrmm_kernel_L4_M1_BEGIN: +.Lztrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L4_END + ble .Lztrmm_kernel_L4_END -ztrmm_kernel_L4_M1_20: +.Lztrmm_kernel_L4_M1_20: INIT1x4 @@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L4_M1_40 + ble .Lztrmm_kernel_L4_M1_40 -ztrmm_kernel_L4_M1_22: +.Lztrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M1_22 + bgt .Lztrmm_kernel_L4_M1_22 -ztrmm_kernel_L4_M1_40: +.Lztrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L4_M1_100 + ble .Lztrmm_kernel_L4_M1_100 -ztrmm_kernel_L4_M1_42: +.Lztrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M1_42 + bgt .Lztrmm_kernel_L4_M1_42 -ztrmm_kernel_L4_M1_100: +.Lztrmm_kernel_L4_M1_100: SAVE1x4 @@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100: #endif -ztrmm_kernel_L4_END: +.Lztrmm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 @@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ztrmm_kernel_L4_BEGIN + bgt .Lztrmm_kernel_L4_BEGIN /******************************************************************************/ -ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ztrmm_kernel_L999 // error, N was less than 4? + ble .Lztrmm_kernel_L999 // error, N was less than 4? 
+ ble .Lztrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 - ble ztrmm_kernel_L1_BEGIN + ble .Lztrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -ztrmm_kernel_L2_M4_BEGIN: +.Lztrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble ztrmm_kernel_L2_M2_BEGIN + ble .Lztrmm_kernel_L2_M2_BEGIN -ztrmm_kernel_L2_M4_20: +.Lztrmm_kernel_L2_M4_20: INIT4x2 @@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ztrmm_kernel_L2_M4_40 + ble .Lztrmm_kernel_L2_M4_40 .align 5 -ztrmm_kernel_L2_M4_22: +.Lztrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M4_22 + bgt .Lztrmm_kernel_L2_M4_22 -ztrmm_kernel_L2_M4_40: +.Lztrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M4_100 + ble .Lztrmm_kernel_L2_M4_100 -ztrmm_kernel_L2_M4_42: +.Lztrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M4_42 + bgt .Lztrmm_kernel_L2_M4_42 -ztrmm_kernel_L2_M4_100: +.Lztrmm_kernel_L2_M4_100: SAVE4x2 @@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ztrmm_kernel_L2_M4_END: +.Lztrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt ztrmm_kernel_L2_M4_20 + bgt .Lztrmm_kernel_L2_M4_20 -ztrmm_kernel_L2_M2_BEGIN: +.Lztrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L2_END + ble .Lztrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L2_M1_BEGIN + ble .Lztrmm_kernel_L2_M1_BEGIN -ztrmm_kernel_L2_M2_20: +.Lztrmm_kernel_L2_M2_20: INIT2x2 @@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ztrmm_kernel_L2_M2_40 + ble .Lztrmm_kernel_L2_M2_40 -ztrmm_kernel_L2_M2_22: +.Lztrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M2_22 + bgt .Lztrmm_kernel_L2_M2_22 -ztrmm_kernel_L2_M2_40: +.Lztrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M2_100 + ble .Lztrmm_kernel_L2_M2_100 -ztrmm_kernel_L2_M2_42: +.Lztrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M2_42 + bgt .Lztrmm_kernel_L2_M2_42 -ztrmm_kernel_L2_M2_100: +.Lztrmm_kernel_L2_M2_100: SAVE2x2 @@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L2_M2_END: +.Lztrmm_kernel_L2_M2_END: -ztrmm_kernel_L2_M1_BEGIN: +.Lztrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L2_END + ble .Lztrmm_kernel_L2_END -ztrmm_kernel_L2_M1_20: +.Lztrmm_kernel_L2_M1_20: INIT1x2 @@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ztrmm_kernel_L2_M1_40 + ble .Lztrmm_kernel_L2_M1_40 -ztrmm_kernel_L2_M1_22: +.Lztrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M1_22 + bgt .Lztrmm_kernel_L2_M1_22 -ztrmm_kernel_L2_M1_40: +.Lztrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M1_100 + ble .Lztrmm_kernel_L2_M1_100 -ztrmm_kernel_L2_M1_42: 
+.Lztrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M1_42 + bgt .Lztrmm_kernel_L2_M1_42 -ztrmm_kernel_L2_M1_100: +.Lztrmm_kernel_L2_M1_100: SAVE1x2 @@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100: #endif -ztrmm_kernel_L2_END: +.Lztrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END: /******************************************************************************/ -ztrmm_kernel_L1_BEGIN: +.Lztrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ztrmm_kernel_L999 // done + ble .Lztrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN: -ztrmm_kernel_L1_M4_BEGIN: +.Lztrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ztrmm_kernel_L1_M2_BEGIN + ble .Lztrmm_kernel_L1_M2_BEGIN -ztrmm_kernel_L1_M4_20: +.Lztrmm_kernel_L1_M4_20: INIT4x1 @@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L1_M4_40 + ble .Lztrmm_kernel_L1_M4_40 .align 5 -ztrmm_kernel_L1_M4_22: +.Lztrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M4_22 + bgt .Lztrmm_kernel_L1_M4_22 -ztrmm_kernel_L1_M4_40: +.Lztrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M4_100 + ble .Lztrmm_kernel_L1_M4_100 -ztrmm_kernel_L1_M4_42: +.Lztrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M4_42 + bgt .Lztrmm_kernel_L1_M4_42 -ztrmm_kernel_L1_M4_100: +.Lztrmm_kernel_L1_M4_100: SAVE4x1 @@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ztrmm_kernel_L1_M4_END: +.Lztrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt ztrmm_kernel_L1_M4_20 + bgt .Lztrmm_kernel_L1_M4_20 -ztrmm_kernel_L1_M2_BEGIN: +.Lztrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L1_END + ble .Lztrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L1_M1_BEGIN + ble .Lztrmm_kernel_L1_M1_BEGIN -ztrmm_kernel_L1_M2_20: +.Lztrmm_kernel_L1_M2_20: INIT2x1 @@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L1_M2_40 + ble .Lztrmm_kernel_L1_M2_40 -ztrmm_kernel_L1_M2_22: +.Lztrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M2_22 + bgt .Lztrmm_kernel_L1_M2_22 -ztrmm_kernel_L1_M2_40: +.Lztrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M2_100 + ble .Lztrmm_kernel_L1_M2_100 -ztrmm_kernel_L1_M2_42: +.Lztrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M2_42 + bgt .Lztrmm_kernel_L1_M2_42 -ztrmm_kernel_L1_M2_100: +.Lztrmm_kernel_L1_M2_100: SAVE2x1 @@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L1_M2_END: +.Lztrmm_kernel_L1_M2_END: -ztrmm_kernel_L1_M1_BEGIN: +.Lztrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L1_END + ble .Lztrmm_kernel_L1_END -ztrmm_kernel_L1_M1_20: +.Lztrmm_kernel_L1_M1_20: INIT1x1 @@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble 
ztrmm_kernel_L1_M1_40 + ble .Lztrmm_kernel_L1_M1_40 -ztrmm_kernel_L1_M1_22: +.Lztrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M1_22 + bgt .Lztrmm_kernel_L1_M1_22 -ztrmm_kernel_L1_M1_40: +.Lztrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M1_100 + ble .Lztrmm_kernel_L1_M1_100 -ztrmm_kernel_L1_M1_42: +.Lztrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M1_42 + bgt .Lztrmm_kernel_L1_M1_42 -ztrmm_kernel_L1_M1_100: +.Lztrmm_kernel_L1_M1_100: SAVE1x1 -ztrmm_kernel_L1_END: +.Lztrmm_kernel_L1_END: -ztrmm_kernel_L999: +.Lztrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)]
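
Note (not part of the patch itself): the renames rely on the GNU assembler's convention that, on ELF targets such as ARM64 Linux, any symbol whose name begins with ".L" is an assembler-local label and is omitted from the object file's symbol table. A minimal sketch with hypothetical names (my_kernel and .Lmy_kernel_loop are illustrative only, not taken from these kernels):

	.text
	.global	my_kernel
my_kernel:                        // global symbol, emitted to the symbol table
	mov	x1, #16               // hypothetical trip count
.Lmy_kernel_loop:                 // ".L" prefix: local label, not emitted
	subs	x1, x1, #1            // decrement counter and set flags
	bne	.Lmy_kernel_loop      // branch back through the local label
	ret

Assembling this and listing symbols with nm should show only my_kernel; spelling the loop label my_kernel_loop instead would make it appear as an additional local text symbol ("t") in the nm output.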