Merge pull request #1334 from ashwinyes/develop_aarch64_20171024_addlocallabels

ARM64: Convert all labels to local labels
This commit is contained in:
Martin Kroeker 2017-10-24 19:50:03 +02:00 committed by GitHub
commit b71f4fe681
50 changed files with 4469 additions and 4469 deletions

View File

@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lamax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
.Lamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lamax_kernel_F1
amax_kernel_F4:
.Lamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
bne .Lamax_kernel_F4
amax_kernel_F1:
.Lamax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999
amax_kernel_F10:
.Lamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
bne .Lamax_kernel_F10
ret
amax_kernel_F1_INIT:
.Lamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lamax_kernel_F1
amax_kernel_S_BEGIN:
.Lamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
ble .Lamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lamax_kernel_S1
amax_kernel_S4:
.Lamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
bne .Lamax_kernel_S4
amax_kernel_S1:
.Lamax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999
amax_kernel_S10:
.Lamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
bne .Lamax_kernel_S10
amax_kernel_L999:
.Lamax_kernel_L999:
ret
amax_kernel_zero:
.Lamax_kernel_zero:
fmov MAXF, REG0
ret

View File

@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lasum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lasum_kernel_F1
asum_kernel_F8:
.Lasum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
bne .Lasum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
.Lasum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
ble .Lasum_kernel_L999
asum_kernel_F10:
.Lasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lasum_kernel_F10
asum_kernel_L999:
.Lasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lasum_kernel_S1
asum_kernel_S4:
.Lasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lasum_kernel_S4
asum_kernel_S1:
.Lasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lasum_kernel_L999
asum_kernel_S10:
.Lasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lasum_kernel_S10
ret

View File

@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
fcmp DA, #0.0
beq axpy_kernel_L999
beq .Laxpy_kernel_L999
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN:
.Laxpy_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1
beq .Laxpy_kernel_F1
axpy_kernel_F8:
.Laxpy_kernel_F8:
KERNEL_F8
subs I, I, #1
bne axpy_kernel_F8
bne .Laxpy_kernel_F8
axpy_kernel_F1:
.Laxpy_kernel_F1:
ands I, N, #7
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
axpy_kernel_F10:
.Laxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
bne .Laxpy_kernel_F10
mov w0, wzr
ret
axpy_kernel_S_BEGIN:
.Laxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Laxpy_kernel_S1
axpy_kernel_S4:
.Laxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
bne .Laxpy_kernel_S4
axpy_kernel_S1:
.Laxpy_kernel_S1:
ands I, N, #3
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
axpy_kernel_S10:
.Laxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
bne .Laxpy_kernel_S10
axpy_kernel_L999:
.Laxpy_kernel_L999:
mov w0, wzr
ret

View File

@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF
cmp N, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lcasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lcasum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lcasum_kernel_F1
asum_kernel_F8:
.Lcasum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
bne .Lcasum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
.Lcasum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
ble .Lcasum_kernel_L999
asum_kernel_F10:
.Lcasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lcasum_kernel_F10
asum_kernel_L999:
.Lcasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lcasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lcasum_kernel_S1
asum_kernel_S4:
.Lcasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lcasum_kernel_S4
asum_kernel_S1:
.Lcasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lcasum_kernel_L999
asum_kernel_S10:
.Lcasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lcasum_kernel_S10
ret

View File

@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
add ppA, temp, pA
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_40:
.Lcgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_42:
.Lcgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42
bgt .Lcgemm_kernel_L4_M4_42
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20
bgt .Lcgemm_kernel_L2_M4_20
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20
bgt .Lcgemm_kernel_L1_M4_20
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5
cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:
INIT4x4
cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:
INIT8x2
@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5
cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:
SAVE8x2
cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5
cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:
SAVE8x1
cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #5 // origK / 32
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x16
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4
@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
// M4 stage of the L4 pass: runs once when bit 2 of M is set, otherwise skipped.
// The K loop is driven by counterL = K / 2; an odd K is finished by one
// KERNEL4x4_SUB at _44/_46. The KERNEL4x4_*, INIT4x4 and SAVE4x4 macros are
// defined outside this excerpt.
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // set flags from M & 7; counterI is not modified
ble cgemm_kernel_L4_END // M & 7 == 0: no M remainder, go to L4_END
ble .Lcgemm_kernel_L4_END // M & 7 == 0: no M remainder, go to L4_END
tst counterI, #4 // test bit 2 of M
ble cgemm_kernel_L4_M2_BEGIN // bit 2 clear: skip the M4 stage
ble .Lcgemm_kernel_L4_M2_BEGIN // bit 2 clear: skip the M4 stage
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
mov pB, origPB // restart pB from origPB for this stage
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? (K / 2 >= 2)
blt cgemm_kernel_L4_M4_32 // no: short path at _32
blt .Lcgemm_kernel_L4_M4_32 // no: short path at _32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 // two counts are covered by the I/M2 pair above and the M1/E pair at _22a
ble cgemm_kernel_L4_M4_22a // nothing left for the steady-state loop
ble .Lcgemm_kernel_L4_M4_22a // nothing left for the steady-state loop
.align 5
cgemm_kernel_L4_M4_22: // steady-state loop: one M1/M2 pair per count
.Lcgemm_kernel_L4_M4_22: // steady-state loop: one M1/M2 pair per count
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a: // final pair of the pipelined sequence
.Lcgemm_kernel_L4_M4_22a: // final pair of the pipelined sequence
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32: // short path: counterL = K / 2 is below 2 here
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32: // short path: counterL = K / 2 is below 2 here
tst counterL, #1 // is K / 2 odd (i.e. 1, assuming K >= 0)?
ble cgemm_kernel_L4_M4_40 // no: only INIT4x4 runs before the odd-K check
ble .Lcgemm_kernel_L4_M4_40 // no: only INIT4x4 runs before the odd-K check
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:
INIT4x4 // NOTE(review): presumably clears the accumulators - confirm against the macro
cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 // counterL = K & 1
ble cgemm_kernel_L4_M4_100 // K even: no trailing step
ble .Lcgemm_kernel_L4_M4_100 // K even: no trailing step
cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB // single trailing step for odd K
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4 // NOTE(review): presumably stores the accumulated tile to C - confirm against the macro
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
// Dispatch on the low two bits of M.
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble cgemm_kernel_L4_END // M & 3 == 0: no M remainder, go to L4_END
ble .Lcgemm_kernel_L4_END // M & 3 == 0: no M remainder, go to L4_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble cgemm_kernel_L4_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lcgemm_kernel_L4_M1_BEGIN // bit 1 clear: skip the M2 stage
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction (N & 3)
.Lcgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction (N & 3)
mov counterJ , origN
tst counterJ , #3 // set flags from N & 3; counterJ is not modified
ble cgemm_kernel_L999 // N & 3 == 0: no N remainder, go to exit (not an error)
ble .Lcgemm_kernel_L999 // N & 3 == 0: no N remainder, go to exit (not an error)
tst counterJ , #2 // test bit 1 of N
ble cgemm_kernel_L1_BEGIN // bit 1 clear: skip the L2 stage
ble .Lcgemm_kernel_L1_BEGIN // bit 1 clear: skip the L2 stage
mov pCRow0, pC // pCRow0 = pC
@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:
INIT8x2
@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5
cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:
SAVE8x2
cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20
// M-remainder dispatch that follows the M8 stage: branch on the low bits of M.
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // set flags from M & 7; counterI is not modified
ble cgemm_kernel_L2_END // M & 7 == 0: no M remainder, go to L2_END
ble .Lcgemm_kernel_L2_END // M & 7 == 0: no M remainder, go to L2_END
tst counterI, #4 // test bit 2 of M (a bit test, not a division)
ble cgemm_kernel_L2_M2_BEGIN // bit 2 clear: skip the M4 stage
ble .Lcgemm_kernel_L2_M2_BEGIN // bit 2 clear: skip the M4 stage
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
// Dispatch on the low two bits of M.
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble cgemm_kernel_L2_END // M & 3 == 0: no M remainder, go to L2_END
ble .Lcgemm_kernel_L2_END // M & 3 == 0: no M remainder, go to L2_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble cgemm_kernel_L2_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lcgemm_kernel_L2_M1_BEGIN // bit 1 clear: skip the M2 stage
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5
cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:
SAVE8x1
cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20
// M-remainder dispatch that follows the M8 stage: branch on the low bits of M.
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // set flags from M & 7; counterI is not modified
ble cgemm_kernel_L1_END // M & 7 == 0: no M remainder, go to L1_END
ble .Lcgemm_kernel_L1_END // M & 7 == 0: no M remainder, go to L1_END
tst counterI, #4 // test bit 2 of M (a bit test, not a division)
ble cgemm_kernel_L1_M2_BEGIN // bit 2 clear: skip the M4 stage
ble .Lcgemm_kernel_L1_M2_BEGIN // bit 2 clear: skip the M4 stage
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
// Dispatch on the low two bits of M.
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble cgemm_kernel_L1_END // M & 3 == 0: no M remainder, go to L1_END
ble .Lcgemm_kernel_L1_END // M & 3 == 0: no M remainder, go to L1_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble cgemm_kernel_L1_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lcgemm_kernel_L1_M1_BEGIN // bit 1 clear: skip the M2 stage
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble copy_kernel_L999
ble .Lcopy_kernel_L999
cmp INC_X, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
// Path taken when neither INC_X nor INC_Y differs from 1 (both compares
// before this label fall through). Runs KERNEL_F4 N >> 2 times, then
// KERNEL_F1 N & 3 times, and returns 0 in w0.
// NOTE(review): KERNEL_F4 / KERNEL_F1 are macros defined outside this excerpt;
// presumably they copy 4 and 1 element(s) per invocation - confirm.
copy_kernel_F_BEGIN:
.Lcopy_kernel_F_BEGIN:
asr I, N, #2 // I = N >> 2
cmp I, xzr
beq copy_kernel_F1 // no full group of 4: go straight to the tail
beq .Lcopy_kernel_F1 // no full group of 4: go straight to the tail
copy_kernel_F4:
.Lcopy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne copy_kernel_F4
bne .Lcopy_kernel_F4
copy_kernel_F1:
.Lcopy_kernel_F1:
ands I, N, #3 // I = N & 3 (tail count)
ble copy_kernel_L999 // tail count is zero: done
ble .Lcopy_kernel_L999 // tail count is zero: done
copy_kernel_F10:
.Lcopy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne copy_kernel_F10
bne .Lcopy_kernel_F10
mov w0, wzr // return value 0
ret
copy_kernel_S_BEGIN:
.Lcopy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble copy_kernel_S1
ble .Lcopy_kernel_S1
copy_kernel_S4:
.Lcopy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne copy_kernel_S4
bne .Lcopy_kernel_S4
// Tail of the strided path: runs KERNEL_S1 N & 3 times, then falls into the
// common exit, which returns 0 in w0.
// NOTE(review): KERNEL_S1 is a macro defined outside this excerpt; presumably
// it copies one element using the INC_X / INC_Y strides - confirm.
copy_kernel_S1:
.Lcopy_kernel_S1:
ands I, N, #3 // I = N & 3 (tail count)
ble copy_kernel_L999 // tail count is zero: done
ble .Lcopy_kernel_L999 // tail count is zero: done
copy_kernel_S10:
.Lcopy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne copy_kernel_S10
bne .Lcopy_kernel_S10
copy_kernel_L999: // common exit
.Lcopy_kernel_L999: // common exit
mov w0, wzr // return value 0
ret

View File

@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN
ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5
ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
.Lctrmm_kernel_L4_M4_40:
INIT4x4
ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:
SAVE4x4
@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20
bne .Lctrmm_kernel_L4_M4_20
// M-remainder dispatch that follows the M4 stage: branch on the low two bits of M.
ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble ctrmm_kernel_L4_END // M & 3 == 0: no M remainder, go to L4_END
ble .Lctrmm_kernel_L4_END // M & 3 == 0: no M remainder, go to L4_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble ctrmm_kernel_L4_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lctrmm_kernel_L4_M1_BEGIN // bit 1 clear: skip the M2 stage
ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:
INIT2x4
@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:
SAVE2x4
@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:
INIT1x4
@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:
SAVE1x4
@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction (N & 3)
.Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction (N & 3)
mov counterJ , origN
tst counterJ , #3 // set flags from N & 3; counterJ is not modified
ble ctrmm_kernel_L999 // N & 3 == 0: no N remainder, go to exit (not an error)
ble .Lctrmm_kernel_L999 // N & 3 == 0: no N remainder, go to exit (not an error)
tst counterJ , #2 // test bit 1 of N
ble ctrmm_kernel_L1_BEGIN // bit 1 clear: skip the L2 stage
ble .Lctrmm_kernel_L1_BEGIN // bit 1 clear: skip the L2 stage
mov pCRow0, pC // pCRow0 = pC
@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN
ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:
INIT4x2
@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5
ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:
SAVE4x2
@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20
bgt .Lctrmm_kernel_L2_M4_20
// M-remainder dispatch that follows the M4 stage: branch on the low two bits of M.
ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble ctrmm_kernel_L2_END // M & 3 == 0: no M remainder, go to L2_END
ble .Lctrmm_kernel_L2_END // M & 3 == 0: no M remainder, go to L2_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble ctrmm_kernel_L2_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lctrmm_kernel_L2_M1_BEGIN // bit 1 clear: skip the M2 stage
ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:
INIT2x2
@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:
SAVE2x2
@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:
INIT1x2
@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:
SAVE1x2
@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/
ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:
INIT4x1
@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5
ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:
SAVE4x1
@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20
bgt .Lctrmm_kernel_L1_M4_20
// M-remainder dispatch that follows the M4 stage: branch on the low two bits of M.
ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // set flags from M & 3; counterI is not modified
ble ctrmm_kernel_L1_END // M & 3 == 0: no M remainder, go to L1_END
ble .Lctrmm_kernel_L1_END // M & 3 == 0: no M remainder, go to L1_END
tst counterI, #2 // test bit 1 of M (a bit test, not a division)
ble ctrmm_kernel_L1_M1_BEGIN // bit 1 clear: skip the M2 stage
ble .Lctrmm_kernel_L1_M1_BEGIN // bit 1 clear: skip the M2 stage
ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:
INIT2x1
@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:
SAVE2x1
@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:
INIT1x1
@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:
SAVE1x1
ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:
ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M8_BEGIN:
.Lctrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN
ble .Lctrmm_kernel_L4_M4_BEGIN
ctrmm_kernel_L4_M8_20:
.Lctrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:
asr counterL , tempK, #3
cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
blt .Lctrmm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
ble .Lctrmm_kernel_L4_M8_22a
.align 5
ctrmm_kernel_L4_M8_22:
.Lctrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
bgt .Lctrmm_kernel_L4_M8_22
.align 5
ctrmm_kernel_L4_M8_22a:
.Lctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44
.align 5
ctrmm_kernel_L4_M8_32:
.Lctrmm_kernel_L4_M8_32:
tst counterL, #1
ble ctrmm_kernel_L4_M8_40
ble .Lctrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44
ctrmm_kernel_L4_M8_40:
.Lctrmm_kernel_L4_M8_40:
INIT8x4
ctrmm_kernel_L4_M8_44:
.Lctrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
ble .Lctrmm_kernel_L4_M8_100
.align 5
ctrmm_kernel_L4_M8_46:
.Lctrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46
bne .Lctrmm_kernel_L4_M8_46
ctrmm_kernel_L4_M8_100:
.Lctrmm_kernel_L4_M8_100:
SAVE8x4
@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ctrmm_kernel_L4_M8_END:
.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20
bne .Lctrmm_kernel_L4_M8_20
// Dispatch for the rows of M not covered by the 8-row blocks of this panel:
// look at the low three bits of origM and pick the 4-row, 2-row or 1-row code.
// TST only sets flags (Z is set when the masked bits are all zero); the masked
// result is never negative, so each BLE below is taken exactly when Z is set.
ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // flags from origM & 7
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END // (origM & 7) == 0: no leftover rows in this panel
tst counterI, #4 // is bit 2 of origM set (a 4-row block is present)?
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN // bit clear: skip the 4-row code
ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5
ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_40:
INIT4x4
ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:
SAVE4x4
@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
// Dispatch for the last rows of M in this panel: test the low two bits of
// origM and pick the 2-row or 1-row code. TST only sets flags; counterI keeps
// the value of origM throughout.
ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // flags from origM & 3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END // (origM & 3) == 0: nothing left in this panel
tst counterI, #2 // is bit 1 of origM set (a 2-row block is present)?
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN // bit clear: go to the 1-row code
ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:
INIT2x4
@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:
SAVE2x4
@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:
INIT1x4
@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:
SAVE1x4
@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
// Columns of N left over after the 4-column panels (the low two bits of origN).
// TST only sets flags; each BLE is taken when the masked bits are all zero.
ctrmm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
.Lctrmm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
mov counterJ , origN
tst counterJ , #3
ble ctrmm_kernel_L999 // (origN & 3) == 0: no columns left, finish
ble .Lctrmm_kernel_L999 // (origN & 3) == 0: no columns left, finish
tst counterJ , #2 // is bit 1 of origN set (a 2-column panel is present)?
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN // bit clear: only the single-column case can remain
mov pCRow0, pC // pCRow0 = pC
@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A
ctrmm_kernel_L2_M8_BEGIN:
.Lctrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN
ble .Lctrmm_kernel_L2_M4_BEGIN
ctrmm_kernel_L2_M8_20:
.Lctrmm_kernel_L2_M8_20:
INIT8x2
@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M8_40
ble .Lctrmm_kernel_L2_M8_40
.align 5
ctrmm_kernel_L2_M8_22:
.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22
bgt .Lctrmm_kernel_L2_M8_22
ctrmm_kernel_L2_M8_40:
.Lctrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100
ble .Lctrmm_kernel_L2_M8_100
ctrmm_kernel_L2_M8_42:
.Lctrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42
bgt .Lctrmm_kernel_L2_M8_42
ctrmm_kernel_L2_M8_100:
.Lctrmm_kernel_L2_M8_100:
SAVE8x2
@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif
ctrmm_kernel_L2_M8_END:
.Lctrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20
bgt .Lctrmm_kernel_L2_M8_20
// Dispatch for the rows of M not covered by the 8-row blocks of the 2-column
// panel: test the low three bits of origM. TST only sets flags; counterI is
// not divided or otherwise modified here.
ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // flags from origM & 7
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END // (origM & 7) == 0: no leftover rows
tst counterI, #4 // is bit 2 of origM set (a 4-row block is present)?
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN // bit clear: skip the 4-row code
ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:
INIT4x2
@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5
ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:
SAVE4x2
@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:
ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN
ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:
INIT2x2
@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:
SAVE2x2
@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:
INIT1x2
@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:
SAVE1x2
@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/
ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
ctrmm_kernel_L1_M8_BEGIN:
.Lctrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN
ble .Lctrmm_kernel_L1_M4_BEGIN
ctrmm_kernel_L1_M8_20:
.Lctrmm_kernel_L1_M8_20:
INIT8x1
@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M8_40
ble .Lctrmm_kernel_L1_M8_40
.align 5
ctrmm_kernel_L1_M8_22:
.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22
bgt .Lctrmm_kernel_L1_M8_22
ctrmm_kernel_L1_M8_40:
.Lctrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100
ble .Lctrmm_kernel_L1_M8_100
ctrmm_kernel_L1_M8_42:
.Lctrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42
bgt .Lctrmm_kernel_L1_M8_42
ctrmm_kernel_L1_M8_100:
.Lctrmm_kernel_L1_M8_100:
SAVE8x1
@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif
ctrmm_kernel_L1_M8_END:
.Lctrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20
bgt .Lctrmm_kernel_L1_M8_20
ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:
INIT4x1
@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5
ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:
SAVE4x1
@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:
// Dispatch for the last rows of M in the single-column panel: test the low two
// bits of origM. TST only sets flags; counterI keeps the value of origM.
ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // flags from origM & 3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END // (origM & 3) == 0: nothing left in this panel
tst counterI, #2 // is bit 1 of origM set (a 2-row block is present)?
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN // bit clear: go to the 1-row code
ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:
INIT2x1
@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:
SAVE2x1
@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:
INIT1x1
@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:
SAVE1x1
ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:
ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999
fcmp DA, #0.0
beq axpy_kernel_L999
beq .Ldaxpy_kernel_L999
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
// Unit-stride path: execution falls through to here only when both INC_X and
// INC_Y compared equal to 1. The work is split into a bulk loop with N / 32
// iterations followed by a tail loop with N % 32 iterations.
axpy_kernel_F_BEGIN:
.Ldaxpy_kernel_F_BEGIN:
asr I, N, #5 // I = N / 32, trip count of the bulk loop
cmp I, xzr
beq axpy_kernel_F1
beq .Ldaxpy_kernel_F1 // I == 0: skip the bulk loop, go straight to the tail
.align 5 // align the loop head to a 2^5 = 32-byte boundary
axpy_kernel_F32:
.Ldaxpy_kernel_F32:
// KERNEL_F32 is a macro defined outside this excerpt; presumably it processes
// 32 elements per expansion, matching the N / 32 trip count - TODO confirm.
KERNEL_F32
subs I, I, #1
bne axpy_kernel_F32
bne .Ldaxpy_kernel_F32
axpy_kernel_F1:
.Ldaxpy_kernel_F1:
ands I, N, #31 // I = N & 31, trip count of the tail loop; also sets flags
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999 // no remainder: done
axpy_kernel_F10:
.Ldaxpy_kernel_F10:
// KERNEL_F1 is defined outside this excerpt; presumably one element per
// expansion - TODO confirm.
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
bne .Ldaxpy_kernel_F10
b axpy_kernel_L999
b .Ldaxpy_kernel_L999 // jump over the strided path to the common exit
axpy_kernel_S_BEGIN:
.Ldaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Ldaxpy_kernel_S1
axpy_kernel_S4:
.Ldaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
bne .Ldaxpy_kernel_S4
// Tail of the strided path: runs KERNEL_S1 once for each of the N & 3
// iterations left after the loop that counts N / 4.
axpy_kernel_S1:
.Ldaxpy_kernel_S1:
ands I, N, #3 // I = N & 3; also sets flags for the branch below
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999 // no remainder: done
axpy_kernel_S10:
.Ldaxpy_kernel_S10:
// KERNEL_S1 is a macro defined outside this excerpt; presumably it handles one
// element using the strides prepared by INIT_S - TODO confirm.
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
bne .Ldaxpy_kernel_S10
// Common exit, the branch target of every early-out and loop exit above:
// clear w0 and return to the caller.
axpy_kernel_L999:
.Ldaxpy_kernel_L999:
mov w0, wzr // w0 = 0
ret

View File

@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
//------------------------------------------------------------------------------
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #2 // L = K / 4
cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp]
@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
// K tail for the 4x4 tile: the preceding loop counts origK / 8, so the
// origK & 7 iterations that remain are run here one KERNEL4x4_SUB at a time.
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = origK % 8 (leftover K iterations)
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100 // nothing left over: go save the tile
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
// KERNEL4x4_SUB is a macro defined outside this excerpt; presumably one K step
// of the 4x4 update per expansion - TODO confirm.
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
// Dispatch for the last rows of M in the 4-column panel: test the low two bits
// of origM and pick the 2-row or 1-row code. TST only sets flags; counterI
// keeps the value of origM throughout.
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // flags from origM & 3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END // (origM & 3) == 0: nothing left in this panel
tst counterI, #2 // is bit 1 of origM set (a 2-row block is present)?
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN // bit clear: go to the 1-row code
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
// Columns of N left over after the 4-column panels (the low two bits of origN).
// TST only sets flags; each BLE is taken when the masked bits are all zero.
dgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
.Ldgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // (origN & 3) == 0: no columns left, finish
ble .Ldgemm_kernel_L999 // (origN & 3) == 0: no columns left, finish
tst counterJ , #2 // is bit 1 of origN set (a 2-column panel is present)?
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN // bit clear: only the single-column case can remain
mov pCRow0, pC // pCRow0 = pC
@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L8_BEGIN:
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
mov pA, origPA // pA = start of A array
dgemm_kernel_L8_M4_BEGIN:
.Ldgemm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN
ble .Ldgemm_kernel_L8_M2_BEGIN
dgemm_kernel_L8_M4_20:
.Ldgemm_kernel_L8_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32
blt .Ldgemm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a
ble .Ldgemm_kernel_L8_M4_22a
.align 5
dgemm_kernel_L8_M4_22:
.Ldgemm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22
bgt .Ldgemm_kernel_L8_M4_22
dgemm_kernel_L8_M4_22a:
.Ldgemm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_32:
.Ldgemm_kernel_L8_M4_32:
tst counterL, #1
ble dgemm_kernel_L8_M4_40
ble .Ldgemm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_40:
.Ldgemm_kernel_L8_M4_40:
INIT4x8
dgemm_kernel_L8_M4_44:
.Ldgemm_kernel_L8_M4_44:
ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100
ble .Ldgemm_kernel_L8_M4_100
dgemm_kernel_L8_M4_46:
.Ldgemm_kernel_L8_M4_46:
KERNEL4x8_SUB
dgemm_kernel_L8_M4_100:
.Ldgemm_kernel_L8_M4_100:
SAVE4x8
dgemm_kernel_L8_M4_END:
.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20
bne .Ldgemm_kernel_L8_M4_20
// Dispatch for the last rows of M in the 8-column panel: test the low two bits
// of origM and pick the 2-row or 1-row code. TST only sets flags; counterI
// keeps the value of origM throughout.
dgemm_kernel_L8_M2_BEGIN:
.Ldgemm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // flags from origM & 3
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END // (origM & 3) == 0: nothing left in this panel
tst counterI, #2 // is bit 1 of origM set (a 2-row block is present)?
ble dgemm_kernel_L8_M1_BEGIN
ble .Ldgemm_kernel_L8_M1_BEGIN // bit clear: go to the 1-row code
dgemm_kernel_L8_M2_20:
.Ldgemm_kernel_L8_M2_20:
INIT2x8
@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M2_40
ble .Ldgemm_kernel_L8_M2_40
dgemm_kernel_L8_M2_22:
.Ldgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22
bgt .Ldgemm_kernel_L8_M2_22
// Tail of the L8/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L8_M2_40:
.Ldgemm_kernel_L8_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L8_M2_100
ble .Ldgemm_kernel_L8_M2_100
dgemm_kernel_L8_M2_42:
.Ldgemm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42
bgt .Ldgemm_kernel_L8_M2_42
dgemm_kernel_L8_M2_100:
.Ldgemm_kernel_L8_M2_100:
SAVE2x8
dgemm_kernel_L8_M2_END:
.Ldgemm_kernel_L8_M2_END:
dgemm_kernel_L8_M1_BEGIN:
.Ldgemm_kernel_L8_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END
dgemm_kernel_L8_M1_20:
.Ldgemm_kernel_L8_M1_20:
INIT1x8
@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M1_40
ble .Ldgemm_kernel_L8_M1_40
dgemm_kernel_L8_M1_22:
.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22
bgt .Ldgemm_kernel_L8_M1_22
// Tail of the L8/M1 step (origK % 8 leftover iterations, then save),
// followed by the bottom of the L8 pass loop.
dgemm_kernel_L8_M1_40:
.Ldgemm_kernel_L8_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L8_M1_100
ble .Ldgemm_kernel_L8_M1_100
dgemm_kernel_L8_M1_42:
.Ldgemm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42
bgt .Ldgemm_kernel_L8_M1_42
dgemm_kernel_L8_M1_100:
.Ldgemm_kernel_L8_M1_100:
SAVE1x8
dgemm_kernel_L8_END:
.Ldgemm_kernel_L8_END:
lsl temp, origK, #6 // temp = origK * 64
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
// another L8 pass while counterJ > 0
bgt dgemm_kernel_L8_BEGIN
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
// Entry of the L4 section. Dispatches on the low three bits of origN:
// none set -> done (L999); bit 2 clear -> skip to the L2 section;
// otherwise set up C/A pointers for one L4 pass.
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7 // origN % 8 == 0 ?
ble dgemm_kernel_L999
ble .Ldgemm_kernel_L999
tst counterJ , #4 // test bit 2 of origN
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 // pC += LDC * 4 (LDC shifted left by 2)
mov pA, origPA // pA = start of A array
// L4 / M4 section: runs origM >> 2 steps (counterI).
// Same shape as the L8/M4 section but with the KERNEL4x4_* macros:
// K is consumed in pairs (counterL = origK >> 1), one odd K iteration goes
// through KERNEL4x4_SUB, and SAVE4x4 ends the step. Macro bodies are outside
// this view -- presumably a pipelined 4x4 multiply-accumulate; verify there.
// Every label and branch is spelled twice (plain and .L-prefixed); both
// spellings refer to the same address.
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// counterI <= 0: no 4-wide step to run, go to the M2 section
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
mov pB, origPB // B is re-read from its start on every step
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
// fewer than two pairs: take the short path at _32
blt dgemm_kernel_L4_M4_32
blt .Ldgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
// Two pairs are accounted for outside the loop (_I/_M2 here, _M1/_E at _22a),
// so only counterL - 2 pairs are left for the loop at _22.
subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a
ble .Ldgemm_kernel_L4_M4_22a
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
// Final pair of the main path; _E is presumably the drain variant (no further loads).
dgemm_kernel_L4_M4_22a:
.Ldgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44
// Short path: counterL < 2 here. Low bit set -> exactly one pair (_I + _E);
// low bit clear -> no pair at all, fall to INIT4x4 at _40.
dgemm_kernel_L4_M4_32:
.Ldgemm_kernel_L4_M4_32:
tst counterL, #1
// after tst with a small positive mask, ble is taken when the tested bit is clear
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
// No pair was run; INIT4x4 presumably clears the accumulators -- confirm in the macro.
INIT4x4
dgemm_kernel_L4_M4_44:
.Ldgemm_kernel_L4_M4_44:
ands counterL , origK, #1 // counterL = origK & 1: is one odd K iteration left?
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_46:
.Ldgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
// next step; loop ends when counterI reaches exactly zero
subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20
bne .Ldgemm_kernel_L4_M4_20
// Dispatch on the low two bits of origM: nothing left -> L4_END,
// bit 1 clear -> M1 section, otherwise fall through to the M2 step.
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
// Tail of the L4/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
// Tail of the L4/M1 step (origK % 8 leftover iterations, then save),
// followed by the end of the L4 pass, which only advances B (no loop back).
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5 // temp = origK * 32
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
// L2 / M4 section head: counterI = origM >> 2 steps; none -> M2 section.
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
// Tail of an L2/M4 step (origK % 8 leftover iterations, then save), the
// bottom of the M4 step loop, and the dispatch into the M2/M1 sections.
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
// next M4 step while counterI > 0
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
// Tail of the L2/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
// Tail of the L2/M1 step (origK % 8 leftover iterations, then save),
// followed by the end of the L2 pass, which advances B.
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
// L1 / M4 section head: counterI = origM >> 2 steps; none -> M2 section.
// Each step starts at _20: INIT4x1 (macro defined elsewhere, presumably
// clears the accumulators), B is rewound, and counterL = origK >> 3 selects
// between the unrolled loop and the leftover path at _40.
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0
// fewer than 8 in K: only the leftover loop runs
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
// Tail of an L1/M4 step (origK % 8 leftover iterations, then save), the
// bottom of the M4 step loop, and the dispatch into the M2/M1 sections.
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
// next M4 step while counterI > 0
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
// Tail of the L1/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
// Tail of the L1/M1 step: origK % 8 leftover iterations, then save.
// L1_END carries no instructions of its own; execution falls through.
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
// L4 / M8 section head: counterI = origM >> 3 steps; none -> M4 section.
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
// Tail of an L4/M8 step. _40 is the entry used when no unrolled group was
// run (INIT8x4 is defined elsewhere; presumably clears the accumulators).
// _44/_46 run the origK % 8 leftover iterations, _100 issues prefetch hints
// and saves, and _END closes the M8 step loop.
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
// prefetch hints for the current A position and the start of B before saving
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
// next step; loop ends when counterI reaches exactly zero
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
// Dispatch on the low three bits of origM: nothing left -> L4_END,
// bit 2 clear -> M2 section, otherwise fall through to the M4 step.
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // origM % 8 == 0 ?
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4 // test bit 2 of counterI (flags only)
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
// Tail of the L4/M4 step (origK % 8 leftover iterations with prefetch
// hints, then save) and the dispatch into the M2/M1 sections.
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
// Tail of the L4/M2 step: origK % 8 leftover iterations (A is prefetched
// once up front, B once per iteration), save, then the M1 decision.
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
// Tail of the L4/M1 step (origK % 8 leftover iterations, then save),
// followed by the bottom of the L4 pass loop.
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5 // temp = origK * 32
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
// another L4 pass while counterJ > 0
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
// L2 / M8 section head: counterI = origM >> 3 steps; none -> M4 section.
dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:
INIT8x2
@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40
.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22
// Tail of an L2/M8 step (origK % 8 leftover iterations, then save), the
// bottom of the M8 step loop, and the dispatch into the M4/M2 sections.
dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:
SAVE8x2
dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:
// next M8 step while counterI > 0
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // origM % 8 == 0 ?
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #4 // test bit 2 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
// Tail of the L2/M4 step (origK % 8 leftover iterations, then save) and
// the dispatch into the M2/M1 sections.
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
// Tail of the L2/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
// Tail of the L2/M1 step (origK % 8 leftover iterations, then save),
// followed by the end of the L2 pass, which advances B.
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
// Entry of the L1 section: runs only when bit 0 of origN is set; otherwise
// jumps to L999. Sets up the C and A pointers for the pass.
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1 // test bit 0 of origN
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
// L1 / M8 section head: counterI = origM >> 3 steps; none -> M4 section.
// Each step starts at _20: INIT8x1 (macro defined elsewhere, presumably
// clears the accumulators), B is rewound, and counterL = origK >> 3 selects
// between the unrolled loop and the leftover path at _40.
dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0
// fewer than 8 in K: only the leftover loop runs
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40
.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22
// Tail of an L1/M8 step (origK % 8 leftover iterations, then save), the
// bottom of the M8 step loop, and the dispatch into the M4/M2 sections.
dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:
SAVE8x1
dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:
// next M8 step while counterI > 0
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7 // origM % 8 == 0 ?
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #4 // test bit 2 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
// Start of the L1/M4 step: INIT4x1 (macro defined elsewhere, presumably
// clears the accumulators), rewind B, and pick the unrolled loop or the
// leftover path depending on origK >> 3.
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
// Tail of the L1/M4 step (origK % 8 leftover iterations, then save) and
// the dispatch into the M2/M1 sections.
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // origM % 4 == 0 ?
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (flags only; nothing is divided)
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
// Tail of the L1/M2 step: run the origK % 8 leftover iterations one at a
// time, save, then decide whether an M1 step is needed.
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (flags only)
// bit clear: no single leftover row, this pass is finished
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
// Tail of the L1/M1 step: origK % 8 leftover iterations (with one prefetch
// hint each for A and B up front), then save.
// L1_END carries no instructions of its own; execution falls through.
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
// L4 / M8 section head: counterI = origM >> 3 steps; none -> M4 section.
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x64
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16
@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB
@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:
INIT8x2
@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40
.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:
SAVE8x2
dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #4 // test bit 2 of counterI (origM & 4); counterI is not modified
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB
@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40
.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:
SAVE8x1
dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #4 // test bit 2 of counterI (origM & 4); counterI is not modified
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB
@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4
cmp J, #0
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN
.align 5
dgemm_ncopy_L4_M4_BEGIN:
.Ldgemm_ncopy_L4_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L4_M4_40
ble .Ldgemm_ncopy_L4_M4_40
.align 5
dgemm_ncopy_L4_M4_20:
.Ldgemm_ncopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_20
bne .Ldgemm_ncopy_L4_M4_20
dgemm_ncopy_L4_M4_40:
.Ldgemm_ncopy_L4_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L4_M4_END
ble .Ldgemm_ncopy_L4_M4_END
.align 5
dgemm_ncopy_L4_M4_60:
.Ldgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_60
bne .Ldgemm_ncopy_L4_M4_60
dgemm_ncopy_L4_M4_END:
.Ldgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
bne .Ldgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M4_BEGIN:
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L2_M4_40
ble .Ldgemm_ncopy_L2_M4_40
.align 5
dgemm_ncopy_L2_M4_20:
.Ldgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_20
bne .Ldgemm_ncopy_L2_M4_20
dgemm_ncopy_L2_M4_40:
.Ldgemm_ncopy_L2_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L2_M4_END
ble .Ldgemm_ncopy_L2_M4_END
.align 5
dgemm_ncopy_L2_M4_60:
.Ldgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_60
bne .Ldgemm_ncopy_L2_M4_60
dgemm_ncopy_L2_M4_END:
.Ldgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M4_BEGIN:
.Ldgemm_ncopy_L1_M4_BEGIN:
mov A01, A00
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L1_M4_40
ble .Ldgemm_ncopy_L1_M4_40
.align 5
dgemm_ncopy_L1_M4_20:
.Ldgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_20
bne .Ldgemm_ncopy_L1_M4_20
dgemm_ncopy_L1_M4_40:
.Ldgemm_ncopy_L1_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L1_M4_END
ble .Ldgemm_ncopy_L1_M4_END
.align 5
dgemm_ncopy_L1_M4_60:
.Ldgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_60
bne .Ldgemm_ncopy_L1_M4_60
dgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L1_M4_END:
dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS

View File

@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L8_BEGIN:
.Ldgemm_ncopy_L8_BEGIN:
asr J, N, #3 // J = N / 8
cmp J, #0
ble dgemm_ncopy_L4_BEGIN
ble .Ldgemm_ncopy_L4_BEGIN
dgemm_ncopy_L8_M8_BEGIN:
.Ldgemm_ncopy_L8_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L8_M8_40
ble .Ldgemm_ncopy_L8_M8_40
dgemm_ncopy_L8_M8_20:
.Ldgemm_ncopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_20
bne .Ldgemm_ncopy_L8_M8_20
dgemm_ncopy_L8_M8_40:
.Ldgemm_ncopy_L8_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L8_M8_END
ble .Ldgemm_ncopy_L8_M8_END
dgemm_ncopy_L8_M8_60:
.Ldgemm_ncopy_L8_M8_60:
COPY1x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_60
bne .Ldgemm_ncopy_L8_M8_60
dgemm_ncopy_L8_M8_END:
.Ldgemm_ncopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
bne .Ldgemm_ncopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:
tst N, #7
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #4
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN
dgemm_ncopy_L4_M8_BEGIN:
.Ldgemm_ncopy_L4_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L4_M8_40
ble .Ldgemm_ncopy_L4_M8_40
dgemm_ncopy_L4_M8_20:
.Ldgemm_ncopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_20
bne .Ldgemm_ncopy_L4_M8_20
dgemm_ncopy_L4_M8_40:
.Ldgemm_ncopy_L4_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L4_M8_END
ble .Ldgemm_ncopy_L4_M8_END
dgemm_ncopy_L4_M8_60:
.Ldgemm_ncopy_L4_M8_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_60
bne .Ldgemm_ncopy_L4_M8_60
dgemm_ncopy_L4_M8_END:
.Ldgemm_ncopy_L4_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M8_BEGIN:
.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L2_M8_40
ble .Ldgemm_ncopy_L2_M8_40
dgemm_ncopy_L2_M8_20:
.Ldgemm_ncopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_20
bne .Ldgemm_ncopy_L2_M8_20
dgemm_ncopy_L2_M8_40:
.Ldgemm_ncopy_L2_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L2_M8_END
ble .Ldgemm_ncopy_L2_M8_END
dgemm_ncopy_L2_M8_60:
.Ldgemm_ncopy_L2_M8_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_60
bne .Ldgemm_ncopy_L2_M8_60
dgemm_ncopy_L2_M8_END:
.Ldgemm_ncopy_L2_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M8_BEGIN:
.Ldgemm_ncopy_L1_M8_BEGIN:
mov A01, A00
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L1_M8_40
ble .Ldgemm_ncopy_L1_M8_40
dgemm_ncopy_L1_M8_20:
.Ldgemm_ncopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_20
bne .Ldgemm_ncopy_L1_M8_20
dgemm_ncopy_L1_M8_40:
.Ldgemm_ncopy_L1_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L1_M8_END
ble .Ldgemm_ncopy_L1_M8_END
dgemm_ncopy_L1_M8_60:
.Ldgemm_ncopy_L1_M8_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_60
bne .Ldgemm_ncopy_L1_M8_60
dgemm_ncopy_L1_M8_END:
.Ldgemm_ncopy_L1_M8_END:
dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS

View File

@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M4, M, #5 // M4 = M * 4 * SIZE
dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN
.align 5
dgemm_tcopy_L4_M4_BEGIN:
.Ldgemm_tcopy_L4_M4_BEGIN:
mov A01, A
add A02, A01, LDA
@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L4_M4_40
ble .Ldgemm_tcopy_L4_M4_40
.align 5
dgemm_tcopy_L4_M4_20:
.Ldgemm_tcopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_tcopy_L4_M4_20
bne .Ldgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40:
.Ldgemm_tcopy_L4_M4_40:
tst N , #2
ble dgemm_tcopy_L4_M4_60
ble .Ldgemm_tcopy_L4_M4_60
COPY2x4
dgemm_tcopy_L4_M4_60:
.Ldgemm_tcopy_L4_M4_60:
tst N, #1
ble dgemm_tcopy_L4_M4_END
ble .Ldgemm_tcopy_L4_M4_END
COPY1x4
dgemm_tcopy_L4_M4_END:
.Ldgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
bne .Ldgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M4_BEGIN:
.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L2_M4_40
ble .Ldgemm_tcopy_L2_M4_40
.align 5
dgemm_tcopy_L2_M4_20:
.Ldgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_tcopy_L2_M4_20
bne .Ldgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40:
.Ldgemm_tcopy_L2_M4_40:
tst N , #2
ble dgemm_tcopy_L2_M4_60
ble .Ldgemm_tcopy_L2_M4_60
COPY2x2
dgemm_tcopy_L2_M4_60:
.Ldgemm_tcopy_L2_M4_60:
tst N , #1
ble dgemm_tcopy_L2_M4_END
ble .Ldgemm_tcopy_L2_M4_END
COPY1x2
dgemm_tcopy_L2_M4_END:
.Ldgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M4_BEGIN:
.Ldgemm_tcopy_L1_M4_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L1_M4_40
ble .Ldgemm_tcopy_L1_M4_40
.align 5
dgemm_tcopy_L1_M4_20:
.Ldgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_tcopy_L1_M4_20
bne .Ldgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40:
.Ldgemm_tcopy_L1_M4_40:
tst N , #2
ble dgemm_tcopy_L1_M4_60
ble .Ldgemm_tcopy_L1_M4_60
COPY2x1
dgemm_tcopy_L1_M4_60:
.Ldgemm_tcopy_L1_M4_60:
tst N , #1
ble dgemm_tcopy_L1_M4_END
ble .Ldgemm_tcopy_L1_M4_END
COPY1x1
dgemm_tcopy_L1_M4_END:
.Ldgemm_tcopy_L1_M4_END:
dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret

View File

@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M8, M, #6 // M8 = M * 8 * SIZE
dgemm_tcopy_L8_BEGIN:
.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8
cmp J, #0
ble dgemm_tcopy_L4_BEGIN
ble .Ldgemm_tcopy_L4_BEGIN
.align 5
dgemm_tcopy_L8_M8_BEGIN:
.Ldgemm_tcopy_L8_M8_BEGIN:
mov A01, A
add A02, A01, LDA
@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L8_M8_40
ble .Ldgemm_tcopy_L8_M8_40
.align 5
dgemm_tcopy_L8_M8_20:
.Ldgemm_tcopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_tcopy_L8_M8_20
bne .Ldgemm_tcopy_L8_M8_20
dgemm_tcopy_L8_M8_40:
.Ldgemm_tcopy_L8_M8_40:
tst N , #4
ble dgemm_tcopy_L8_M8_60
ble .Ldgemm_tcopy_L8_M8_60
COPY4x8
dgemm_tcopy_L8_M8_60:
.Ldgemm_tcopy_L8_M8_60:
tst N , #2
ble dgemm_tcopy_L8_M8_80
ble .Ldgemm_tcopy_L8_M8_80
COPY2x8
dgemm_tcopy_L8_M8_80:
.Ldgemm_tcopy_L8_M8_80:
tst N, #1
ble dgemm_tcopy_L8_M8_END
ble .Ldgemm_tcopy_L8_M8_END
COPY1x8
dgemm_tcopy_L8_M8_END:
.Ldgemm_tcopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
bne .Ldgemm_tcopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
tst M, #7
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #4
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN
dgemm_tcopy_L4_M8_BEGIN:
.Ldgemm_tcopy_L4_M8_BEGIN:
mov A01, A
add A02, A01, LDA
@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L4_M8_40
ble .Ldgemm_tcopy_L4_M8_40
.align 5
dgemm_tcopy_L4_M8_20:
.Ldgemm_tcopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_tcopy_L4_M8_20
bne .Ldgemm_tcopy_L4_M8_20
dgemm_tcopy_L4_M8_40:
.Ldgemm_tcopy_L4_M8_40:
tst N , #4
ble dgemm_tcopy_L4_M8_60
ble .Ldgemm_tcopy_L4_M8_60
COPY4x4
dgemm_tcopy_L4_M8_60:
.Ldgemm_tcopy_L4_M8_60:
tst N , #2
ble dgemm_tcopy_L4_M8_80
ble .Ldgemm_tcopy_L4_M8_80
COPY2x4
dgemm_tcopy_L4_M8_80:
.Ldgemm_tcopy_L4_M8_80:
tst N, #1
ble dgemm_tcopy_L4_M8_END
ble .Ldgemm_tcopy_L4_M8_END
COPY1x4
dgemm_tcopy_L4_M8_END:
.Ldgemm_tcopy_L4_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M8_BEGIN:
.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L2_M8_40
ble .Ldgemm_tcopy_L2_M8_40
.align 5
dgemm_tcopy_L2_M8_20:
.Ldgemm_tcopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_tcopy_L2_M8_20
bne .Ldgemm_tcopy_L2_M8_20
dgemm_tcopy_L2_M8_40:
.Ldgemm_tcopy_L2_M8_40:
tst N , #4
ble dgemm_tcopy_L2_M8_60
ble .Ldgemm_tcopy_L2_M8_60
COPY4x2
dgemm_tcopy_L2_M8_60:
.Ldgemm_tcopy_L2_M8_60:
tst N , #2
ble dgemm_tcopy_L2_M8_80
ble .Ldgemm_tcopy_L2_M8_80
COPY2x2
dgemm_tcopy_L2_M8_80:
.Ldgemm_tcopy_L2_M8_80:
tst N , #1
ble dgemm_tcopy_L2_M8_END
ble .Ldgemm_tcopy_L2_M8_END
COPY1x2
dgemm_tcopy_L2_M8_END:
.Ldgemm_tcopy_L2_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M8_BEGIN:
.Ldgemm_tcopy_L1_M8_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L1_M8_40
ble .Ldgemm_tcopy_L1_M8_40
.align 5
dgemm_tcopy_L1_M8_20:
.Ldgemm_tcopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_tcopy_L1_M8_20
bne .Ldgemm_tcopy_L1_M8_20
dgemm_tcopy_L1_M8_40:
.Ldgemm_tcopy_L1_M8_40:
tst N , #4
ble dgemm_tcopy_L1_M8_60
ble .Ldgemm_tcopy_L1_M8_60
COPY4x1
dgemm_tcopy_L1_M8_60:
.Ldgemm_tcopy_L1_M8_60:
tst N , #2
ble dgemm_tcopy_L1_M8_80
ble .Ldgemm_tcopy_L1_M8_80
COPY2x1
dgemm_tcopy_L1_M8_80:
.Ldgemm_tcopy_L1_M8_80:
tst N , #1
ble dgemm_tcopy_L1_M8_END
ble .Ldgemm_tcopy_L1_M8_END
COPY1x1
dgemm_tcopy_L1_M8_END:
.Ldgemm_tcopy_L1_M8_END:
dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret

View File

@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble dot_kernel_L999
ble .Ldot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
.Ldot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Ldot_kernel_F1
dot_kernel_F4:
.Ldot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
bne .Ldot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
.Ldot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999
dot_kernel_F10:
.Ldot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
bne .Ldot_kernel_F10
ret
dot_kernel_S_BEGIN:
.Ldot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Ldot_kernel_S1
dot_kernel_S4:
.Ldot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
bne .Ldot_kernel_S4
dot_kernel_S1:
.Ldot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999
dot_kernel_S10:
.Ldot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
bne .Ldot_kernel_S10
dot_kernel_L999:
.Ldot_kernel_L999:
ret

View File

@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:
tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
INIT4x4
dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
// K remainder loop of the L4/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
// Macro defined outside this view; presumably one K step of the 2x4 update.
KERNEL2x4_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L4_END.
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
// K remainder loop of the L4/M1 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
// Macro defined outside this view; presumably one K step of the 1x4 update.
KERNEL1x4_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
// Start of the M4 pass of the L2 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
// K remainder loop of the L2/M4 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M4_100.
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
// Macro defined outside this view; presumably one K step of the 4x2 update.
KERNEL4x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L2 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is still > 0 after the decrement.
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
// K remainder loop of the L2/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
// Macro defined outside this view; presumably one K step of the 2x2 update.
KERNEL2x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L2_END.
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
// K remainder loop of the L2/M1 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
// Macro defined outside this view; presumably one K step of the 1x2 update.
KERNEL1x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
// Start of the M4 pass of the L1 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
// K remainder loop of the L1/M4 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M4_100.
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
// Macro defined outside this view; presumably one K step of the 4x1 update.
KERNEL4x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L1 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is still > 0 after the decrement.
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
// K remainder loop of the L1/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
// Macro defined outside this view; presumably one K step of the 2x1 update.
KERNEL2x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L1_END.
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
// K remainder loop of the L1/M1 section, followed by its save step and the
// end-of-L1 label. Plain and `.L` spellings are listed back to back.
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
// Macro defined outside this view; presumably one K step of the 1x1 update.
KERNEL1x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
// Macro defined outside this view; presumably writes the 1x1 result out — TODO confirm.
SAVE1x1
// Target of the "nothing left in M" branches of the L1 section; falls through.
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN
ble .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L8_BEGIN:
.Ldtrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:
mov pA, origPA // pA = start of A array
// Start of the M4 pass of the L8 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M4_BEGIN:
.Ldtrmm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L8_M2_BEGIN
ble .Ldtrmm_kernel_L8_M2_BEGIN
dtrmm_kernel_L8_M4_20:
.Ldtrmm_kernel_L8_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32
blt .Ldtrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a
ble .Ldtrmm_kernel_L8_M4_22a
.align 5
// K loop of the L8/M4 section. The KERNEL4x8_* and INIT4x8 macros are defined
// outside this view; presumably _I starts, _M1/_M2 continue and _E finishes a
// software-pipelined sequence — TODO confirm against the macro bodies.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M4_22:
.Ldtrmm_kernel_L8_M4_22:
// Loop body: one M1/M2 pair per iteration.
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
// Keep looping while counterL is still > 0 after the decrement.
bgt dtrmm_kernel_L8_M4_22
bgt .Ldtrmm_kernel_L8_M4_22
dtrmm_kernel_L8_M4_22a:
.Ldtrmm_kernel_L8_M4_22a:
// Final M1 + E pair, then on to the odd-K check at M4_44.
KERNEL4x8_M1
KERNEL4x8_E
b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_32:
.Ldtrmm_kernel_L8_M4_32:
// Short path, targeted by the `blt` that follows `cmp counterL, #2` earlier.
tst counterL, #1
// Bit 0 clear: no I/E pair to run, go to INIT4x8 instead.
ble dtrmm_kernel_L8_M4_40
ble .Ldtrmm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_40:
.Ldtrmm_kernel_L8_M4_40:
// Presumably clears the accumulators when no I macro has run — TODO confirm.
INIT4x8
dtrmm_kernel_L8_M4_44:
.Ldtrmm_kernel_L8_M4_44:
ands counterL, tempK, #1
// tempK even (mask result zero): no extra step, go to M4_100.
ble dtrmm_kernel_L8_M4_100
ble .Ldtrmm_kernel_L8_M4_100
dtrmm_kernel_L8_M4_46:
.Ldtrmm_kernel_L8_M4_46:
// tempK odd: one additional single step.
KERNEL4x8_SUB
dtrmm_kernel_L8_M4_100:
.Ldtrmm_kernel_L8_M4_100:
SAVE4x8
@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L8 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M4_END:
.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is non-zero after the decrement.
bne dtrmm_kernel_L8_M4_20
bne .Ldtrmm_kernel_L8_M4_20
dtrmm_kernel_L8_M2_BEGIN:
.Ldtrmm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L8_M1_BEGIN
ble .Ldtrmm_kernel_L8_M1_BEGIN
dtrmm_kernel_L8_M2_20:
.Ldtrmm_kernel_L8_M2_20:
INIT2x8
@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M2_40
ble .Ldtrmm_kernel_L8_M2_40
dtrmm_kernel_L8_M2_22:
.Ldtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22
bgt .Ldtrmm_kernel_L8_M2_22
// K remainder loop of the L8/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M2_40:
.Ldtrmm_kernel_L8_M2_40:
ands counterL, tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L8_M2_100
ble .Ldtrmm_kernel_L8_M2_100
dtrmm_kernel_L8_M2_42:
.Ldtrmm_kernel_L8_M2_42:
// Macro defined outside this view; presumably one K step of the 2x8 update.
KERNEL2x8_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L8_M2_42
bgt .Ldtrmm_kernel_L8_M2_42
dtrmm_kernel_L8_M2_100:
.Ldtrmm_kernel_L8_M2_100:
SAVE2x8
@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M2_END:
.Ldtrmm_kernel_L8_M2_END:
dtrmm_kernel_L8_M1_BEGIN:
.Ldtrmm_kernel_L8_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L8_END.
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END
dtrmm_kernel_L8_M1_20:
.Ldtrmm_kernel_L8_M1_20:
INIT1x8
@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M1_40
ble .Ldtrmm_kernel_L8_M1_40
dtrmm_kernel_L8_M1_22:
.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22
bgt .Ldtrmm_kernel_L8_M1_22
// K remainder loop of the L8/M1 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L8_M1_40:
.Ldtrmm_kernel_L8_M1_40:
ands counterL, tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L8_M1_100
ble .Ldtrmm_kernel_L8_M1_100
dtrmm_kernel_L8_M1_42:
.Ldtrmm_kernel_L8_M1_42:
// Macro defined outside this view; presumably one K step of the 1x8 update.
KERNEL1x8_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L8_M1_42
bgt .Ldtrmm_kernel_L8_M1_42
dtrmm_kernel_L8_M1_100:
.Ldtrmm_kernel_L8_M1_100:
SAVE1x8
@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L8_END:
.Ldtrmm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN
bgt .Ldtrmm_kernel_L8_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
ble dtrmm_kernel_L999
ble .Ldtrmm_kernel_L999
tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
// Start of the M4 pass of the L4 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5
// K loop of the L4/M4 section. The KERNEL4x4_* and INIT4x4 macros are defined
// outside this view; presumably _I starts, _M1/_M2 continue and _E finishes a
// software-pipelined sequence — TODO confirm against the macro bodies.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
// Loop body: one M1/M2 pair per iteration.
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
// Keep looping while counterL is still > 0 after the decrement.
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:
// Final M1 + E pair, then on to the odd-K check at M4_44.
KERNEL4x4_M1
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:
// Short path, targeted by the `blt` that follows `cmp counterL, #2` earlier.
tst counterL, #1
// Bit 0 clear: no I/E pair to run, go to INIT4x4 instead.
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
// Presumably clears the accumulators when no I macro has run — TODO confirm.
INIT4x4
dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
// tempK even (mask result zero): no extra step, go to M4_100.
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:
// tempK odd: one additional single step.
KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L4 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is non-zero after the decrement.
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
// K remainder loop of the L4/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
// Macro defined outside this view; presumably one K step of the 2x4 update.
KERNEL2x4_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L4_END.
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
// K remainder loop of the L4/M1 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
// Macro defined outside this view; presumably one K step of the 1x4 update.
KERNEL1x4_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
// Start of the M4 pass of the L2 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
// K remainder loop of the L2/M4 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M4_100.
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
// Macro defined outside this view; presumably one K step of the 4x2 update.
KERNEL4x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L2 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is still > 0 after the decrement.
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
// K remainder loop of the L2/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
// Macro defined outside this view; presumably one K step of the 2x2 update.
KERNEL2x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L2_END.
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
// K remainder loop of the L2/M1 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
// Macro defined outside this view; presumably one K step of the 1x2 update.
KERNEL1x2_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
// Start of the M4 pass of the L1 section: count groups of 4 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// No complete group of 4 (counterI <= 0): skip to the M2 section.
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
// K remainder loop of the L1/M4 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M4_100.
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
// Macro defined outside this view; presumably one K step of the 4x1 update.
KERNEL4x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
// End of one M4 iteration of the L1 section, then dispatch on the low two bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
// Loop back to M4_20 while counterI is still > 0 after the decrement.
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
// K remainder loop of the L1/M2 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M2_100.
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
// Macro defined outside this view; presumably one K step of the 2x1 update.
KERNEL2x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
// M2 section ends and the M1 check begins (adjacent labels, no code between).
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
// counterI is not reloaded here; presumably it still holds origM from M2_BEGIN — TODO confirm.
tst counterI, #1 // tests bit 0 of counterI; the register is not modified
// Bit 0 clear: no M1 work, go to L1_END.
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
// K remainder loop of the L1/M1 section, followed by its save step and the
// end-of-L1 label. Plain and `.L` spellings are listed back to back.
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M1_100.
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
// Macro defined outside this view; presumably one K step of the 1x1 update.
KERNEL1x1_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
// Macro defined outside this view; presumably writes the 1x1 result out — TODO confirm.
SAVE1x1
// Target of the "nothing left in M" branches of the L1 section; falls through.
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
// Start of the M8 pass of the L4 section: count groups of 8 in origM.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M8_BEGIN:
.Ldtrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
// No complete group of 8 (counterI <= 0): skip to the M4 section.
ble dtrmm_kernel_L4_M4_BEGIN
ble .Ldtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
.Ldtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
blt .Ldtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
ble .Ldtrmm_kernel_L4_M8_22a
.align 5
dtrmm_kernel_L4_M8_22:
.Ldtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
bgt .Ldtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
.Ldtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
.Ldtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
ble .Ldtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44
// Tail of the L4/M8 K loop: INIT8x4 entry, then the K remainder loop.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M8_40:
.Ldtrmm_kernel_L4_M8_40:
// Macro defined outside this view; presumably clears the accumulators — TODO confirm.
INIT8x4
dtrmm_kernel_L4_M8_44:
.Ldtrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
// counterL = tempK & 7; zero means no leftover steps, go to M8_100.
ble dtrmm_kernel_L4_M8_100
ble .Ldtrmm_kernel_L4_M8_100
// Align the loop head to a 2^5 = 32-byte boundary.
.align 5
dtrmm_kernel_L4_M8_46:
.Ldtrmm_kernel_L4_M8_46:
// Macro defined outside this view; presumably one K step of the 8x4 update.
KERNEL8x4_SUB
subs counterL, counterL, #1
// counterL enters the loop in 1..7, so `bne` exits exactly when it reaches zero.
bne dtrmm_kernel_L4_M8_46
bne .Ldtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
.Ldtrmm_kernel_L4_M8_100:
SAVE8x4
@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
// End of one M8 iteration, then dispatch on the low three bits of M.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M8_END:
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
// Loop back to M8_20 while counterI is non-zero after the decrement.
bne dtrmm_kernel_L4_M8_20
bne .Ldtrmm_kernel_L4_M8_20
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
// (M & 7) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #4
// Bit 2 clear: skip the M4 work and go to the M2 check.
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
INIT4x4
@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
// K remainder loop of the L4/M4 section.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7
// Mask result is 0..7, so `ble` is taken only when it is zero: go to M4_100.
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_42:
.Ldtrmm_kernel_L4_M4_42:
// Macro defined outside this view; presumably one K step of the 4x4 update.
KERNEL4x4_SUB
subs counterL, counterL, #1
// Repeat while counterL is still > 0, then fall through.
bgt dtrmm_kernel_L4_M4_42
bgt .Ldtrmm_kernel_L4_M4_42
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
// M4 section ends (no loop-back here) and the M2 dispatch begins.
// Plain and `.L` spellings of each label/branch are listed back to back.
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
// (M & 3) == 0: nothing left in M. The mask cannot set the sign bit, so `ble` acts as `beq`.
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // tests bit 1 of M; counterI itself is not modified
// Bit 1 clear: skip the M2 work and go to the M1 check.
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // (origN & 3) == 0: nothing left in N direction, go to exit
ble .Ldtrmm_kernel_L999 // (origN & 3) == 0: nothing left in N direction, go to exit
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A
dtrmm_kernel_L2_M8_BEGIN:
.Ldtrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN
ble .Ldtrmm_kernel_L2_M4_BEGIN
dtrmm_kernel_L2_M8_20:
.Ldtrmm_kernel_L2_M8_20:
INIT8x2
@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M8_40
ble .Ldtrmm_kernel_L2_M8_40
.align 5
dtrmm_kernel_L2_M8_22:
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22
bgt .Ldtrmm_kernel_L2_M8_22
dtrmm_kernel_L2_M8_40:
.Ldtrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100
ble .Ldtrmm_kernel_L2_M8_100
dtrmm_kernel_L2_M8_42:
.Ldtrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42
bgt .Ldtrmm_kernel_L2_M8_42
dtrmm_kernel_L2_M8_100:
.Ldtrmm_kernel_L2_M8_100:
SAVE8x2
@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif
dtrmm_kernel_L2_M8_END:
.Ldtrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20
bgt .Ldtrmm_kernel_L2_M8_20
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #4 // flags only: is bit 2 of counterI (= origM) set? counterI is not modified
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // flags only: is bit 1 of counterI (= origM) set? counterI is not modified
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
dtrmm_kernel_L1_M8_BEGIN:
.Ldtrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN
ble .Ldtrmm_kernel_L1_M4_BEGIN
dtrmm_kernel_L1_M8_20:
.Ldtrmm_kernel_L1_M8_20:
INIT8x1
@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M8_40
ble .Ldtrmm_kernel_L1_M8_40
.align 5
dtrmm_kernel_L1_M8_22:
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22
bgt .Ldtrmm_kernel_L1_M8_22
dtrmm_kernel_L1_M8_40:
.Ldtrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100
ble .Ldtrmm_kernel_L1_M8_100
dtrmm_kernel_L1_M8_42:
.Ldtrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42
bgt .Ldtrmm_kernel_L1_M8_42
dtrmm_kernel_L1_M8_100:
.Ldtrmm_kernel_L1_M8_100:
SAVE8x1
@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif
dtrmm_kernel_L1_M8_END:
.Ldtrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20
bgt .Ldtrmm_kernel_L1_M8_20
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #4 // flags only: is bit 2 of counterI (= origM) set? counterI is not modified
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // flags only: is bit 1 of counterI (= origM) set? counterI is not modified
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
cmp M, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N
cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
bne .Lgemv_n_kernel_S_BEGIN
gemv_n_kernel_F_LOOP:
.Lgemv_n_kernel_F_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y
mov Y_OPTR, Y
gemv_n_kernel_F32:
.Lgemv_n_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_n_kernel_F4
beq .Lgemv_n_kernel_F4
gemv_n_kernel_F320:
.Lgemv_n_kernel_F320:
KERNEL_F16
KERNEL_F16
subs I, I, #1
bne gemv_n_kernel_F320
bne .Lgemv_n_kernel_F320
gemv_n_kernel_F4:
.Lgemv_n_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1
beq .Lgemv_n_kernel_F1
gemv_n_kernel_F40:
.Lgemv_n_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_n_kernel_F40
bne .Lgemv_n_kernel_F40
gemv_n_kernel_F1:
.Lgemv_n_kernel_F1:
ands I, M, #3
ble gemv_n_kernel_F_END
ble .Lgemv_n_kernel_F_END
gemv_n_kernel_F10:
.Lgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_n_kernel_F10
bne .Lgemv_n_kernel_F10
gemv_n_kernel_F_END:
.Lgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_F_LOOP
bne .Lgemv_n_kernel_F_LOOP
b gemv_n_kernel_L999
b .Lgemv_n_kernel_L999
gemv_n_kernel_S_BEGIN:
.Lgemv_n_kernel_S_BEGIN:
INIT_S
gemv_n_kernel_S_LOOP:
.Lgemv_n_kernel_S_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble gemv_n_kernel_S1
ble .Lgemv_n_kernel_S1
gemv_n_kernel_S4:
.Lgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S4
bne .Lgemv_n_kernel_S4
gemv_n_kernel_S1:
.Lgemv_n_kernel_S1:
ands I, M, #3
ble gemv_n_kernel_S_END
ble .Lgemv_n_kernel_S_END
gemv_n_kernel_S10:
.Lgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S10
bne .Lgemv_n_kernel_S10
gemv_n_kernel_S_END:
.Lgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_S_LOOP
bne .Lgemv_n_kernel_S_LOOP
gemv_n_kernel_L999:
.Lgemv_n_kernel_L999:
mov w0, wzr

View File

@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
cmp M, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N
cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
bne .Lgemv_t_kernel_S_BEGIN
gemv_t_kernel_F_LOOP:
.Lgemv_t_kernel_F_LOOP:
fmov TEMP, REG0
fmov TEMP1, REG0
@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
gemv_t_kernel_F32:
.Lgemv_t_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_t_kernel_F4
beq .Lgemv_t_kernel_F4
gemv_t_kernel_F320:
.Lgemv_t_kernel_F320:
KERNEL_F32
subs I, I, #1
bne gemv_t_kernel_F320
bne .Lgemv_t_kernel_F320
KERNEL_F32_FINALIZE
gemv_t_kernel_F4:
.Lgemv_t_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1
beq .Lgemv_t_kernel_F1
gemv_t_kernel_F40:
.Lgemv_t_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_t_kernel_F40
bne .Lgemv_t_kernel_F40
gemv_t_kernel_F1:
.Lgemv_t_kernel_F1:
KERNEL_F4_FINALIZE
ands I, M, #3
ble gemv_t_kernel_F_END
ble .Lgemv_t_kernel_F_END
gemv_t_kernel_F10:
.Lgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_t_kernel_F10
bne .Lgemv_t_kernel_F10
gemv_t_kernel_F_END:
.Lgemv_t_kernel_F_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
bne .Lgemv_t_kernel_F_LOOP
b gemv_t_kernel_L999
b .Lgemv_t_kernel_L999
gemv_t_kernel_S_BEGIN:
.Lgemv_t_kernel_S_BEGIN:
INIT_S
gemv_t_kernel_S_LOOP:
.Lgemv_t_kernel_S_LOOP:
fmov TEMP, REG0
mov A_PTR, A
@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble gemv_t_kernel_S1
ble .Lgemv_t_kernel_S1
gemv_t_kernel_S4:
.Lgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S4
bne .Lgemv_t_kernel_S4
gemv_t_kernel_S1:
.Lgemv_t_kernel_S1:
ands I, M, #3
ble gemv_t_kernel_S_END
ble .Lgemv_t_kernel_S_END
gemv_t_kernel_S10:
.Lgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S10
bne .Lgemv_t_kernel_S10
gemv_t_kernel_S_END:
.Lgemv_t_kernel_S_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
bne .Lgemv_t_kernel_S_LOOP
gemv_t_kernel_L999:
.Lgemv_t_kernel_L999:
RESTORE_REGS

View File

@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Liamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
.Liamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999
asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
beq .Liamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
.Liamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
bne .Liamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
.Liamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
ble .Liamax_kernel_L999
iamax_kernel_F10:
.Liamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
bne .Liamax_kernel_F10
b iamax_kernel_L999
b .Liamax_kernel_L999
iamax_kernel_S_BEGIN:
.Liamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Liamax_kernel_S1
iamax_kernel_S4:
.Liamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
bne .Liamax_kernel_S4
iamax_kernel_S1:
.Liamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
ble .Liamax_kernel_L999
iamax_kernel_S10:
.Liamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
bne .Liamax_kernel_S10
iamax_kernel_L999:
.Liamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
.Liamax_kernel_zero:
mov x0, xzr
ret

View File

@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Lizamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
.Lizamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
ble .Lizamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
.Lizamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
bne .Lizamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
.Lizamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
iamax_kernel_F10:
.Lizamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
bne .Lizamax_kernel_F10
b iamax_kernel_L999
b .Lizamax_kernel_L999
iamax_kernel_S_BEGIN:
.Lizamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Lizamax_kernel_S1
iamax_kernel_S4:
.Lizamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
bne .Lizamax_kernel_S4
iamax_kernel_S1:
.Lizamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
iamax_kernel_S10:
.Lizamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
bne .Lizamax_kernel_S10
iamax_kernel_L999:
.Lizamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
.Lizamax_kernel_zero:
mov x0, xzr
ret

View File

@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT
cmp N, #0
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lnrm2_kernel_L999
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lnrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
.Lnrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lnrm2_kernel_F1
nrm2_kernel_F8:
.Lnrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
bne .Lnrm2_kernel_F8
nrm2_kernel_F1:
.Lnrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999
nrm2_kernel_F10:
.Lnrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
bne .Lnrm2_kernel_F10
b nrm2_kernel_L999
b .Lnrm2_kernel_L999
nrm2_kernel_S_BEGIN:
.Lnrm2_kernel_S_BEGIN:
INIT_S
@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:
.align 5
nrm2_kernel_S10:
.Lnrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
bne .Lnrm2_kernel_S10
nrm2_kernel_L999:
.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ

View File

@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
ble .Lrot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
.Lrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lrot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
.Lrot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
bne .Lrot_kernel_F4
rot_kernel_F1:
.Lrot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999
INIT_F1
rot_kernel_F10:
.Lrot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
bne .Lrot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
.Lrot_kernel_S_BEGIN:
INIT_S
INIT_F1
@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lrot_kernel_S1
rot_kernel_S4:
.Lrot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
bne .Lrot_kernel_S4
rot_kernel_S1:
.Lrot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999
rot_kernel_S10:
.Lrot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
bne .Lrot_kernel_S10
rot_kernel_L999:
.Lrot_kernel_L999:
mov w0, wzr
ret

View File

@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble scal_kernel_L999
ble .Lscal_kernel_L999
fcmp DA, #0.0
beq scal_kernel_zero
beq .Lscal_kernel_zero
cmp INC_X, #1
bne scal_kernel_S_BEGIN
bne .Lscal_kernel_S_BEGIN
scal_kernel_F_BEGIN:
.Lscal_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq scal_kernel_F1
beq .Lscal_kernel_F1
KERNEL_INIT_F8
scal_kernel_F8:
.Lscal_kernel_F8:
KERNEL_F8
subs I, I, #1
bne scal_kernel_F8
bne .Lscal_kernel_F8
scal_kernel_F1:
.Lscal_kernel_F1:
ands I, N, #7
ble scal_kernel_L999
ble .Lscal_kernel_L999
scal_kernel_F10:
.Lscal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne scal_kernel_F10
bne .Lscal_kernel_F10
mov w0, wzr
ret
scal_kernel_S_BEGIN:
.Lscal_kernel_S_BEGIN:
INIT_S
mov X_COPY, X
asr I, N, #2
cmp I, xzr
ble scal_kernel_S1
ble .Lscal_kernel_S1
scal_kernel_S4:
.Lscal_kernel_S4:
KERNEL_S4
subs I, I, #1
bne scal_kernel_S4
bne .Lscal_kernel_S4
scal_kernel_S1:
.Lscal_kernel_S1:
ands I, N, #3
ble scal_kernel_L999
ble .Lscal_kernel_L999
scal_kernel_S10:
.Lscal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne scal_kernel_S10
bne .Lscal_kernel_S10
scal_kernel_L999:
.Lscal_kernel_L999:
mov w0, wzr
ret
scal_kernel_zero:
.Lscal_kernel_zero:
INIT_S
scal_kernel_Z1:
.Lscal_kernel_Z1:
st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1
bne .Lscal_kernel_Z1
mov w0, wzr
ret

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN
ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/
sgemm_kernel_L4_BEGIN:
.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1
add pA_3, temp, pA_2
sgemm_kernel_L4_M16_BEGIN:
.Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
ble .Lsgemm_kernel_L4_M8_BEGIN
sgemm_kernel_L4_M16_20:
.Lsgemm_kernel_L4_M16_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32
blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K
subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
ble .Lsgemm_kernel_L4_M16_22a
.align 5
sgemm_kernel_L4_M16_22:
.Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
bgt .Lsgemm_kernel_L4_M16_22
sgemm_kernel_L4_M16_22a:
.Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_32:
.Lsgemm_kernel_L4_M16_32:
tst counterL, #1
ble sgemm_kernel_L4_M16_40
ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_40:
.Lsgemm_kernel_L4_M16_40:
INIT16x4
sgemm_kernel_L4_M16_44:
.Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100
ble .Lsgemm_kernel_L4_M16_100
sgemm_kernel_L4_M16_46:
.Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
sgemm_kernel_L4_M16_100:
.Lsgemm_kernel_L4_M16_100:
SAVE16x4
sgemm_kernel_L4_M16_END:
.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp
add pA_0, pA_0, temp
@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp
add pA_3, pA_2, temp
subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20
bne .Lsgemm_kernel_L4_M16_20
sgemm_kernel_L4_M8_BEGIN:
.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN
ble .Lsgemm_kernel_L4_M4_BEGIN
sgemm_kernel_L4_M8_20:
.Lsgemm_kernel_L4_M8_20:
INIT8x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M8_40
ble .Lsgemm_kernel_L4_M8_40
sgemm_kernel_L4_M8_22:
.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_SUB
KERNEL8x4_SUB
@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22
bgt .Lsgemm_kernel_L4_M8_22
sgemm_kernel_L4_M8_40:
.Lsgemm_kernel_L4_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100
ble .Lsgemm_kernel_L4_M8_100
sgemm_kernel_L4_M8_42:
.Lsgemm_kernel_L4_M8_42:
KERNEL8x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42
bgt .Lsgemm_kernel_L4_M8_42
sgemm_kernel_L4_M8_100:
.Lsgemm_kernel_L4_M8_100:
SAVE8x4
sgemm_kernel_L4_M8_END:
.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp
sgemm_kernel_L4_M4_BEGIN:
.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN
ble .Lsgemm_kernel_L4_M2_BEGIN
sgemm_kernel_L4_M4_20:
.Lsgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M4_40
ble .Lsgemm_kernel_L4_M4_40
sgemm_kernel_L4_M4_22:
.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22
bgt .Lsgemm_kernel_L4_M4_22
sgemm_kernel_L4_M4_40:
.Lsgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100
ble .Lsgemm_kernel_L4_M4_100
sgemm_kernel_L4_M4_42:
.Lsgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42
bgt .Lsgemm_kernel_L4_M4_42
sgemm_kernel_L4_M4_100:
.Lsgemm_kernel_L4_M4_100:
SAVE4x4
sgemm_kernel_L4_M4_END:
.Lsgemm_kernel_L4_M4_END:
sgemm_kernel_L4_M2_BEGIN:
.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #2 // flags only: is bit 1 of counterI (= origM) set? counterI is not modified
ble sgemm_kernel_L4_M1_BEGIN
ble .Lsgemm_kernel_L4_M1_BEGIN
sgemm_kernel_L4_M2_20:
.Lsgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M2_40
ble .Lsgemm_kernel_L4_M2_40
sgemm_kernel_L4_M2_22:
.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22
bgt .Lsgemm_kernel_L4_M2_22
sgemm_kernel_L4_M2_40:
.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100
ble .Lsgemm_kernel_L4_M2_100
sgemm_kernel_L4_M2_42:
.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42
bgt .Lsgemm_kernel_L4_M2_42
sgemm_kernel_L4_M2_100:
.Lsgemm_kernel_L4_M2_100:
SAVE2x4
sgemm_kernel_L4_M2_END:
.Lsgemm_kernel_L4_M2_END:
sgemm_kernel_L4_M1_BEGIN:
.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
sgemm_kernel_L4_M1_20:
.Lsgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M1_40
ble .Lsgemm_kernel_L4_M1_40
sgemm_kernel_L4_M1_22:
.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22
bgt .Lsgemm_kernel_L4_M1_22
sgemm_kernel_L4_M1_40:
.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100
ble .Lsgemm_kernel_L4_M1_100
sgemm_kernel_L4_M1_42:
.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42
bgt .Lsgemm_kernel_L4_M1_42
sgemm_kernel_L4_M1_100:
.Lsgemm_kernel_L4_M1_100:
SAVE1x4
sgemm_kernel_L4_END:
.Lsgemm_kernel_L4_END:
lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN
bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble sgemm_kernel_L999
ble .Lsgemm_kernel_L999
tst counterJ , #2
ble sgemm_kernel_L1_BEGIN
ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
sgemm_kernel_L2_M4_BEGIN:
.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN
ble .Lsgemm_kernel_L2_M2_BEGIN
sgemm_kernel_L2_M4_20:
.Lsgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M4_40
ble .Lsgemm_kernel_L2_M4_40
.align 5
sgemm_kernel_L2_M4_22:
.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22
bgt .Lsgemm_kernel_L2_M4_22
sgemm_kernel_L2_M4_40:
.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100
ble .Lsgemm_kernel_L2_M4_100
sgemm_kernel_L2_M4_42:
.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42
bgt .Lsgemm_kernel_L2_M4_42
sgemm_kernel_L2_M4_100:
.Lsgemm_kernel_L2_M4_100:
SAVE4x2
sgemm_kernel_L2_M4_END:
.Lsgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20
bgt .Lsgemm_kernel_L2_M4_20
sgemm_kernel_L2_M2_BEGIN:
.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble sgemm_kernel_L2_M1_BEGIN
ble .Lsgemm_kernel_L2_M1_BEGIN
sgemm_kernel_L2_M2_20:
.Lsgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M2_40
ble .Lsgemm_kernel_L2_M2_40
sgemm_kernel_L2_M2_22:
.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22
bgt .Lsgemm_kernel_L2_M2_22
sgemm_kernel_L2_M2_40:
.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100
ble .Lsgemm_kernel_L2_M2_100
sgemm_kernel_L2_M2_42:
.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42
bgt .Lsgemm_kernel_L2_M2_42
sgemm_kernel_L2_M2_100:
.Lsgemm_kernel_L2_M2_100:
SAVE2x2
sgemm_kernel_L2_M2_END:
.Lsgemm_kernel_L2_M2_END:
sgemm_kernel_L2_M1_BEGIN:
.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END
sgemm_kernel_L2_M1_20:
.Lsgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L2_M1_40
ble .Lsgemm_kernel_L2_M1_40
sgemm_kernel_L2_M1_22:
.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22
bgt .Lsgemm_kernel_L2_M1_22
sgemm_kernel_L2_M1_40:
.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100
ble .Lsgemm_kernel_L2_M1_100
sgemm_kernel_L2_M1_42:
.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42
bgt .Lsgemm_kernel_L2_M1_42
sgemm_kernel_L2_M1_100:
.Lsgemm_kernel_L2_M1_100:
SAVE1x2
sgemm_kernel_L2_END:
.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
sgemm_kernel_L1_BEGIN:
.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble sgemm_kernel_L999 // done
ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:
sgemm_kernel_L1_M4_BEGIN:
.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN
ble .Lsgemm_kernel_L1_M2_BEGIN
sgemm_kernel_L1_M4_20:
.Lsgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M4_40
ble .Lsgemm_kernel_L1_M4_40
.align 5
sgemm_kernel_L1_M4_22:
.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22
bgt .Lsgemm_kernel_L1_M4_22
sgemm_kernel_L1_M4_40:
.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100
ble .Lsgemm_kernel_L1_M4_100
sgemm_kernel_L1_M4_42:
.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42
bgt .Lsgemm_kernel_L1_M4_42
sgemm_kernel_L1_M4_100:
.Lsgemm_kernel_L1_M4_100:
SAVE4x1
sgemm_kernel_L1_M4_END:
.Lsgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20
bgt .Lsgemm_kernel_L1_M4_20
sgemm_kernel_L1_M2_BEGIN:
.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble sgemm_kernel_L1_M1_BEGIN
ble .Lsgemm_kernel_L1_M1_BEGIN
sgemm_kernel_L1_M2_20:
.Lsgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M2_40
ble .Lsgemm_kernel_L1_M2_40
sgemm_kernel_L1_M2_22:
.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22
bgt .Lsgemm_kernel_L1_M2_22
sgemm_kernel_L1_M2_40:
.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100
ble .Lsgemm_kernel_L1_M2_100
sgemm_kernel_L1_M2_42:
.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42
bgt .Lsgemm_kernel_L1_M2_42
sgemm_kernel_L1_M2_100:
.Lsgemm_kernel_L1_M2_100:
SAVE2x1
sgemm_kernel_L1_M2_END:
.Lsgemm_kernel_L1_M2_END:
sgemm_kernel_L1_M1_BEGIN:
.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END
sgemm_kernel_L1_M1_20:
.Lsgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M1_40
ble .Lsgemm_kernel_L1_M1_40
sgemm_kernel_L1_M1_22:
.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22
bgt .Lsgemm_kernel_L1_M1_22
sgemm_kernel_L1_M1_40:
.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100
ble .Lsgemm_kernel_L1_M1_100
sgemm_kernel_L1_M1_42:
.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42
bgt .Lsgemm_kernel_L1_M1_42
sgemm_kernel_L1_M1_100:
.Lsgemm_kernel_L1_M1_100:
SAVE1x1
sgemm_kernel_L1_END:
.Lsgemm_kernel_L1_END:
sgemm_kernel_L999:
.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
strmm_kernel_begin:
.Lstrmm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble strmm_kernel_L2_BEGIN
ble .Lstrmm_kernel_L2_BEGIN
/******************************************************************************/
strmm_kernel_L4_BEGIN:
.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
strmm_kernel_L4_M4_BEGIN:
.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN
ble .Lstrmm_kernel_L4_M2_BEGIN
strmm_kernel_L4_M4_20:
.Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32
blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a
ble .Lstrmm_kernel_L4_M4_22a
.align 5
strmm_kernel_L4_M4_22:
.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22
bgt .Lstrmm_kernel_L4_M4_22
strmm_kernel_L4_M4_22a:
.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_32:
.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
ble strmm_kernel_L4_M4_40
ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_40:
.Lstrmm_kernel_L4_M4_40:
INIT4x4
strmm_kernel_L4_M4_44:
.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100
ble .Lstrmm_kernel_L4_M4_100
strmm_kernel_L4_M4_46:
.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
strmm_kernel_L4_M4_100:
.Lstrmm_kernel_L4_M4_100:
SAVE4x4
@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L4_M4_END:
.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20
bne .Lstrmm_kernel_L4_M4_20
strmm_kernel_L4_M2_BEGIN:
.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble strmm_kernel_L4_M1_BEGIN
ble .Lstrmm_kernel_L4_M1_BEGIN
strmm_kernel_L4_M2_20:
.Lstrmm_kernel_L4_M2_20:
INIT2x4
@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M2_40
ble .Lstrmm_kernel_L4_M2_40
strmm_kernel_L4_M2_22:
.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22
bgt .Lstrmm_kernel_L4_M2_22
strmm_kernel_L4_M2_40:
.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100
ble .Lstrmm_kernel_L4_M2_100
strmm_kernel_L4_M2_42:
.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42
bgt .Lstrmm_kernel_L4_M2_42
strmm_kernel_L4_M2_100:
.Lstrmm_kernel_L4_M2_100:
SAVE2x4
@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif
strmm_kernel_L4_M2_END:
.Lstrmm_kernel_L4_M2_END:
strmm_kernel_L4_M1_BEGIN:
.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END
strmm_kernel_L4_M1_20:
.Lstrmm_kernel_L4_M1_20:
INIT1x4
@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M1_40
ble .Lstrmm_kernel_L4_M1_40
strmm_kernel_L4_M1_22:
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22
bgt .Lstrmm_kernel_L4_M1_22
strmm_kernel_L4_M1_40:
.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100
ble .Lstrmm_kernel_L4_M1_100
strmm_kernel_L4_M1_42:
.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42
bgt .Lstrmm_kernel_L4_M1_42
strmm_kernel_L4_M1_100:
.Lstrmm_kernel_L4_M1_100:
SAVE1x4
@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif
strmm_kernel_L4_END:
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN
bgt .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/
strmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble strmm_kernel_L999
ble .Lstrmm_kernel_L999
tst counterJ , #2
ble strmm_kernel_L1_BEGIN
ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
strmm_kernel_L2_M4_BEGIN:
.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN
ble .Lstrmm_kernel_L2_M2_BEGIN
strmm_kernel_L2_M4_20:
.Lstrmm_kernel_L2_M4_20:
INIT4x2
@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M4_40
ble .Lstrmm_kernel_L2_M4_40
.align 5
strmm_kernel_L2_M4_22:
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22
bgt .Lstrmm_kernel_L2_M4_22
strmm_kernel_L2_M4_40:
.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100
ble .Lstrmm_kernel_L2_M4_100
strmm_kernel_L2_M4_42:
.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42
bgt .Lstrmm_kernel_L2_M4_42
strmm_kernel_L2_M4_100:
.Lstrmm_kernel_L2_M4_100:
SAVE4x2
@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L2_M4_END:
.Lstrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20
bgt .Lstrmm_kernel_L2_M4_20
strmm_kernel_L2_M2_BEGIN:
.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble strmm_kernel_L2_M1_BEGIN
ble .Lstrmm_kernel_L2_M1_BEGIN
strmm_kernel_L2_M2_20:
.Lstrmm_kernel_L2_M2_20:
INIT2x2
@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M2_40
ble .Lstrmm_kernel_L2_M2_40
strmm_kernel_L2_M2_22:
.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22
bgt .Lstrmm_kernel_L2_M2_22
strmm_kernel_L2_M2_40:
.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100
ble .Lstrmm_kernel_L2_M2_100
strmm_kernel_L2_M2_42:
.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42
bgt .Lstrmm_kernel_L2_M2_42
strmm_kernel_L2_M2_100:
.Lstrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
strmm_kernel_L2_M2_END:
.Lstrmm_kernel_L2_M2_END:
strmm_kernel_L2_M1_BEGIN:
.Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END
strmm_kernel_L2_M1_20:
.Lstrmm_kernel_L2_M1_20:
INIT1x2
@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble strmm_kernel_L2_M1_40
ble .Lstrmm_kernel_L2_M1_40
strmm_kernel_L2_M1_22:
.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22
bgt .Lstrmm_kernel_L2_M1_22
strmm_kernel_L2_M1_40:
.Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100
ble .Lstrmm_kernel_L2_M1_100
strmm_kernel_L2_M1_42:
.Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42
bgt .Lstrmm_kernel_L2_M1_42
strmm_kernel_L2_M1_100:
.Lstrmm_kernel_L2_M1_100:
SAVE1x2
@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
strmm_kernel_L2_END:
.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:
/******************************************************************************/
strmm_kernel_L1_BEGIN:
.Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble strmm_kernel_L999 // done
ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
strmm_kernel_L1_M4_BEGIN:
.Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN
ble .Lstrmm_kernel_L1_M2_BEGIN
strmm_kernel_L1_M4_20:
.Lstrmm_kernel_L1_M4_20:
INIT4x1
@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M4_40
ble .Lstrmm_kernel_L1_M4_40
.align 5
strmm_kernel_L1_M4_22:
.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22
bgt .Lstrmm_kernel_L1_M4_22
strmm_kernel_L1_M4_40:
.Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100
ble .Lstrmm_kernel_L1_M4_100
strmm_kernel_L1_M4_42:
.Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42
bgt .Lstrmm_kernel_L1_M4_42
strmm_kernel_L1_M4_100:
.Lstrmm_kernel_L1_M4_100:
SAVE4x1
@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L1_M4_END:
.Lstrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20
bgt .Lstrmm_kernel_L1_M4_20
strmm_kernel_L1_M2_BEGIN:
.Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble strmm_kernel_L1_M1_BEGIN
ble .Lstrmm_kernel_L1_M1_BEGIN
strmm_kernel_L1_M2_20:
.Lstrmm_kernel_L1_M2_20:
INIT2x1
@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M2_40
ble .Lstrmm_kernel_L1_M2_40
strmm_kernel_L1_M2_22:
.Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22
bgt .Lstrmm_kernel_L1_M2_22
strmm_kernel_L1_M2_40:
.Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100
ble .Lstrmm_kernel_L1_M2_100
strmm_kernel_L1_M2_42:
.Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42
bgt .Lstrmm_kernel_L1_M2_42
strmm_kernel_L1_M2_100:
.Lstrmm_kernel_L1_M2_100:
SAVE2x1
@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif
strmm_kernel_L1_M2_END:
.Lstrmm_kernel_L1_M2_END:
strmm_kernel_L1_M1_BEGIN:
.Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END
strmm_kernel_L1_M1_20:
.Lstrmm_kernel_L1_M1_20:
INIT1x1
@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M1_40
ble .Lstrmm_kernel_L1_M1_40
strmm_kernel_L1_M1_22:
.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22
bgt .Lstrmm_kernel_L1_M1_22
strmm_kernel_L1_M1_40:
.Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100
ble .Lstrmm_kernel_L1_M1_100
strmm_kernel_L1_M1_42:
.Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42
bgt .Lstrmm_kernel_L1_M1_42
strmm_kernel_L1_M1_100:
.Lstrmm_kernel_L1_M1_100:
SAVE1x1
@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif
#endif
strmm_kernel_L1_END:
.Lstrmm_kernel_L1_END:
#if 0
#if !defined(LEFT)
@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif
#endif
strmm_kernel_L999:
.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

View File

@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble swap_kernel_L999
ble .Lswap_kernel_L999
cmp INC_X, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
swap_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq swap_kernel_F1
beq .Lswap_kernel_F1
swap_kernel_F8:
.Lswap_kernel_F8:
KERNEL_F8
subs I, I, #1
bne swap_kernel_F8
bne .Lswap_kernel_F8
swap_kernel_F1:
.Lswap_kernel_F1:
ands I, N, #7
ble swap_kernel_L999
ble .Lswap_kernel_L999
swap_kernel_F10:
.Lswap_kernel_F10:
KERNEL_F1
subs I, I, #1
bne swap_kernel_F10
bne .Lswap_kernel_F10
b swap_kernel_L999
b .Lswap_kernel_L999
swap_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble swap_kernel_S1
ble .Lswap_kernel_S1
swap_kernel_S4:
.Lswap_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S4
bne .Lswap_kernel_S4
swap_kernel_S1:
.Lswap_kernel_S1:
ands I, N, #3
ble swap_kernel_L999
ble .Lswap_kernel_L999
swap_kernel_S10:
.Lswap_kernel_S10:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S10
bne .Lswap_kernel_S10
swap_kernel_L999:
.Lswap_kernel_L999:
mov w0, wzr
ret

View File

@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lzamax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
.Lzamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lzamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lzamax_kernel_F1
amax_kernel_F4:
.Lzamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
bne .Lzamax_kernel_F4
amax_kernel_F1:
.Lzamax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999
amax_kernel_F10:
.Lzamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
bne .Lzamax_kernel_F10
ret
amax_kernel_F1_INIT:
.Lzamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lzamax_kernel_F1
amax_kernel_S_BEGIN:
.Lzamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
ble .Lzamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lzamax_kernel_S1
amax_kernel_S4:
.Lzamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
bne .Lzamax_kernel_S4
amax_kernel_S1:
.Lzamax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999
amax_kernel_S10:
.Lzamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
bne .Lzamax_kernel_S10
amax_kernel_L999:
.Lzamax_kernel_L999:
ret
amax_kernel_zero:
.Lzamax_kernel_zero:
fmov MAXF, REG0
ret

View File

@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0
cmp N, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lzasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lzasum_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq asum_kernel_F1
beq .Lzasum_kernel_F1
asum_kernel_F4:
.Lzasum_kernel_F4:
KERNEL_F4
subs I, I, #1
bne asum_kernel_F4
bne .Lzasum_kernel_F4
KERNEL_F4_FINALIZE
asum_kernel_F1:
.Lzasum_kernel_F1:
ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999
asum_kernel_F10:
.Lzasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lzasum_kernel_F10
asum_kernel_L999:
.Lzasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lzasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lzasum_kernel_S1
asum_kernel_S4:
.Lzasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lzasum_kernel_S4
asum_kernel_S1:
.Lzasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999
asum_kernel_S10:
.Lzasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lzasum_kernel_S10
ret

View File

@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
mov Y_COPY, Y
fcmp DA_R, #0.0
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999
beq .Lzaxpy_kernel_L999
.L1:
INIT
cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
zaxpy_kernel_F_BEGIN:
.Lzaxpy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zaxpy_kernel_F1
beq .Lzaxpy_kernel_F1
KERNEL_INIT_F4
zaxpy_kernel_F4:
.Lzaxpy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zaxpy_kernel_F4
bne .Lzaxpy_kernel_F4
zaxpy_kernel_F1:
.Lzaxpy_kernel_F1:
ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
zaxpy_kernel_F10:
.Lzaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zaxpy_kernel_F10
bne .Lzaxpy_kernel_F10
mov w0, wzr
ret
zaxpy_kernel_S_BEGIN:
.Lzaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1
ble .Lzaxpy_kernel_S1
zaxpy_kernel_S4:
.Lzaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S4
bne .Lzaxpy_kernel_S4
zaxpy_kernel_S1:
.Lzaxpy_kernel_S1:
ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
zaxpy_kernel_S10:
.Lzaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S10
bne .Lzaxpy_kernel_S10
zaxpy_kernel_L999:
.Lzaxpy_kernel_L999:
mov w0, wzr
ret

View File

@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble dot_kernel_L999
ble .Lzdot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
.Lzdot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Lzdot_kernel_F1
dot_kernel_F4:
.Lzdot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
bne .Lzdot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
.Lzdot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999
dot_kernel_F10:
.Lzdot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
bne .Lzdot_kernel_F10
ret
dot_kernel_S_BEGIN:
.Lzdot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Lzdot_kernel_S1
dot_kernel_S4:
.Lzdot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
bne .Lzdot_kernel_S4
dot_kernel_S1:
.Lzdot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999
dot_kernel_S10:
.Lzdot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
bne .Lzdot_kernel_S10
dot_kernel_L999:
.Lzdot_kernel_L999:
ret

View File

@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a
.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:
INIT4x4
zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20
zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
tst counterI, #2 // sets flags from counterI & 2 (is bit 1 set?); counterI is unchanged
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:
SAVE2x4
zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:
zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // sets flags from counterI & 1 (is counterI odd?); counterI is unchanged
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:
SAVE1x4
zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999
tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5
zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22
zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:
SAVE4x2
zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20
zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:
SAVE2x2
zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:
zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:
SAVE1x2
zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:
zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5
zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22
zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:
SAVE4x1
zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20
zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:
SAVE2x1
zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:
zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:
SAVE1x1
zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:
zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a
.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:
INIT4x4
zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20
zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:
SAVE2x4
zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:
zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:
SAVE1x4
zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
zgemm_kernel_L2_BEGIN: // N-direction tail: fewer than 4 columns left (N % 4)
.Lzgemm_kernel_L2_BEGIN: // N-direction tail: fewer than 4 columns left (N % 4)
mov counterJ , origN
tst counterJ , #3 // N % 4 == 0: no tail columns, finished
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999
tst counterJ , #2 // bit 1 of N clear: no 2-column panel, go to the single-column case
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5
zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22
zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:
SAVE4x2
zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20
zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:
SAVE2x2
zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:
zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:
SAVE1x2
zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:
zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5
zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22
zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:
SAVE4x1
zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20
zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:
SAVE2x1
zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:
zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:
SAVE1x1
zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:
zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT
cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
bne .Lzgemv_n_kernel_S_BEGIN
zgemv_n_kernel_F_LOOP:
.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:
asr I, M, #2
cmp I, xzr
beq zgemv_n_kernel_F1
beq .Lzgemv_n_kernel_F1
zgemv_n_kernel_F4:
.Lzgemv_n_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zgemv_n_kernel_F4
bne .Lzgemv_n_kernel_F4
zgemv_n_kernel_F1:
.Lzgemv_n_kernel_F1:
ands I, M, #3
ble zgemv_n_kernel_F_END
ble .Lzgemv_n_kernel_F_END
zgemv_n_kernel_F10:
.Lzgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_n_kernel_F10
bne .Lzgemv_n_kernel_F10
zgemv_n_kernel_F_END:
.Lzgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_F_LOOP
bne .Lzgemv_n_kernel_F_LOOP
b zgemv_n_kernel_L999
b .Lzgemv_n_kernel_L999
zgemv_n_kernel_S_BEGIN:
.Lzgemv_n_kernel_S_BEGIN:
INIT_S
zgemv_n_kernel_S_LOOP:
.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1
ble .Lzgemv_n_kernel_S1
zgemv_n_kernel_S4:
.Lzgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S4
bne .Lzgemv_n_kernel_S4
zgemv_n_kernel_S1:
.Lzgemv_n_kernel_S1:
ands I, M, #3
ble zgemv_n_kernel_S_END
ble .Lzgemv_n_kernel_S_END
zgemv_n_kernel_S10:
.Lzgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S10
bne .Lzgemv_n_kernel_S10
zgemv_n_kernel_S_END:
.Lzgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP
bne .Lzgemv_n_kernel_S_LOOP
zgemv_n_kernel_L999:
.Lzgemv_n_kernel_L999:
RESTORE_REGS
mov w0, wzr

View File

@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT
cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
bne .Lzgemv_t_kernel_S_BEGIN
zgemv_t_kernel_F_LOOP:
.Lzgemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:
asr I, M, #2
cmp I, xzr
beq zgemv_t_kernel_F1
beq .Lzgemv_t_kernel_F1
zgemv_t_kernel_F4:
.Lzgemv_t_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zgemv_t_kernel_F4
bne .Lzgemv_t_kernel_F4
KERNEL_F4_FINALIZE
zgemv_t_kernel_F1:
.Lzgemv_t_kernel_F1:
ands I, M, #3
ble zgemv_t_kernel_F_END
ble .Lzgemv_t_kernel_F_END
zgemv_t_kernel_F10:
.Lzgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_t_kernel_F10
bne .Lzgemv_t_kernel_F10
zgemv_t_kernel_F_END:
.Lzgemv_t_kernel_F_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_F_LOOP
bne .Lzgemv_t_kernel_F_LOOP
b zgemv_t_kernel_L999
b .Lzgemv_t_kernel_L999
zgemv_t_kernel_S_BEGIN:
.Lzgemv_t_kernel_S_BEGIN:
INIT_S
zgemv_t_kernel_S_LOOP:
.Lzgemv_t_kernel_S_LOOP:
mov A_PTR, A
mov X_PTR, X
@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1
ble .Lzgemv_t_kernel_S1
zgemv_t_kernel_S4:
.Lzgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S4
bne .Lzgemv_t_kernel_S4
zgemv_t_kernel_S1:
.Lzgemv_t_kernel_S1:
ands I, M, #3
ble zgemv_t_kernel_S_END
ble .Lzgemv_t_kernel_S_END
zgemv_t_kernel_S10:
.Lzgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S10
bne .Lzgemv_t_kernel_S10
zgemv_t_kernel_S_END:
.Lzgemv_t_kernel_S_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP
bne .Lzgemv_t_kernel_S_LOOP
zgemv_t_kernel_L999:
.Lzgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret

View File

@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
INIT
cmp N, #0
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lznrm2_kernel_L999
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lznrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
.Lznrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lznrm2_kernel_F1
nrm2_kernel_F8:
.Lznrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
bne .Lznrm2_kernel_F8
nrm2_kernel_F1:
.Lznrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999
nrm2_kernel_F10:
.Lznrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
bne .Lznrm2_kernel_F10
b nrm2_kernel_L999
b .Lznrm2_kernel_L999
nrm2_kernel_S_BEGIN:
.Lznrm2_kernel_S_BEGIN:
INIT_S
@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:
.align 5
nrm2_kernel_S10:
.Lznrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
bne .Lznrm2_kernel_S10
nrm2_kernel_L999:
.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ

View File

@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
ble .Lzrot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
.Lzrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lzrot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
.Lzrot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
bne .Lzrot_kernel_F4
rot_kernel_F1:
.Lzrot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999
rot_kernel_F10:
.Lzrot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
bne .Lzrot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
.Lzrot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lzrot_kernel_S1
rot_kernel_S4:
.Lzrot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
bne .Lzrot_kernel_S4
rot_kernel_S1:
.Lzrot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999
rot_kernel_S10:
.Lzrot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
bne .Lzrot_kernel_S10
rot_kernel_L999:
.Lzrot_kernel_L999:
mov w0, wzr
ret

View File

@ -215,71 +215,71 @@ zscal_begin:
mov X_COPY, X
cmp N, xzr
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero
bne .Lzscal_kernel_R_non_zero
fcmp DA_I, #0.0
beq zscal_kernel_RI_zero
beq .Lzscal_kernel_RI_zero
b zscal_kernel_R_zero
b .Lzscal_kernel_R_zero
zscal_kernel_R_non_zero:
.Lzscal_kernel_R_non_zero:
fcmp DA_I, #0.0
beq zscal_kernel_I_zero
beq .Lzscal_kernel_I_zero
/*******************************************************************************
* A_R != 0 && A_I != 0
*******************************************************************************/
zscal_kernel_RI_non_zero:
.Lzscal_kernel_RI_non_zero:
INIT
cmp INC_X, #1
bne zscal_kernel_S_BEGIN
bne .Lzscal_kernel_S_BEGIN
zscal_kernel_F_BEGIN:
.Lzscal_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1
beq .Lzscal_kernel_F1
KERNEL_INIT_F4
zscal_kernel_F4:
.Lzscal_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zscal_kernel_F4
bne .Lzscal_kernel_F4
zscal_kernel_F1:
.Lzscal_kernel_F1:
ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
zscal_kernel_F10:
.Lzscal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zscal_kernel_F10
bne .Lzscal_kernel_F10
mov w0, wzr
ret
zscal_kernel_S_BEGIN:
.Lzscal_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1
ble .Lzscal_kernel_S1
zscal_kernel_S4:
.Lzscal_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S4
bne .Lzscal_kernel_S4
zscal_kernel_S1:
.Lzscal_kernel_S1:
ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
zscal_kernel_S10:
.Lzscal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S10
bne .Lzscal_kernel_S10
zscal_kernel_L999:
.Lzscal_kernel_L999:
mov w0, wzr
ret
@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0
*******************************************************************************/
zscal_kernel_R_zero:
.Lzscal_kernel_R_zero:
INIT_S
#if !defined(DOUBLE)
@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif
zscal_kernel_R_zero_1:
.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_R_zero_1
bne .Lzscal_kernel_R_zero_1
mov w0, wzr
ret
@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0
*******************************************************************************/
zscal_kernel_I_zero:
.Lzscal_kernel_I_zero:
INIT_S
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif
zscal_kernel_I_zero_1:
.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_I_zero_1
bne .Lzscal_kernel_I_zero_1
mov w0, wzr
ret
@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0
*******************************************************************************/
// Path taken when both DA_R and DA_I compared equal to 0.0: every visited
// element of X is overwritten with the pair (DA_R, DA_I); X is never read here.
// The loop counts N down to zero, so it relies on the earlier N <= 0 early-out.
// NOTE(review): INIT_S is defined outside this view; presumably it turns INC_X
// into a byte stride -- confirm against the macro definition.
zscal_kernel_RI_zero:
.Lzscal_kernel_RI_zero:
INIT_S
zscal_kernel_RI_zero_1:
.Lzscal_kernel_RI_zero_1:
stp DA_R, DA_I, [X] // store real and imaginary part of one element
add X, X, INC_X // step to the next element
subs N, N, #1 // one element done; sets Z when none remain
bne zscal_kernel_RI_zero_1
bne .Lzscal_kernel_RI_zero_1
mov w0, wzr // return value 0
ret

View File

@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN
ble .Lztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ztrmm_kernel_L4_M4_BEGIN:
.Lztrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
ble .Lztrmm_kernel_L4_M2_BEGIN
.align 5
ztrmm_kernel_L4_M4_20:
.Lztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
blt .Lztrmm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
ble .Lztrmm_kernel_L4_M4_22a
.align 5
ztrmm_kernel_L4_M4_22:
.Lztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
bgt .Lztrmm_kernel_L4_M4_22
.align 5
ztrmm_kernel_L4_M4_22a:
.Lztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44
.align 5
ztrmm_kernel_L4_M4_32:
.Lztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
ble .Lztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44
ztrmm_kernel_L4_M4_40:
.Lztrmm_kernel_L4_M4_40:
INIT4x4
ztrmm_kernel_L4_M4_44:
.Lztrmm_kernel_L4_M4_44:
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
ble .Lztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
bne .Lztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
.Lztrmm_kernel_L4_M4_100:
SAVE4x4
@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
bne .Lztrmm_kernel_L4_M4_20
ztrmm_kernel_L4_M2_BEGIN:
.Lztrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3 // M % 4 == 0: no leftover rows for this panel
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END
tst counterI, #2 // test bit 1 of M (is a 2-row tile pending?); counterI is not modified
ble ztrmm_kernel_L4_M1_BEGIN
ble .Lztrmm_kernel_L4_M1_BEGIN
ztrmm_kernel_L4_M2_20:
.Lztrmm_kernel_L4_M2_20:
INIT2x4
@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L4_M2_40
ble .Lztrmm_kernel_L4_M2_40
ztrmm_kernel_L4_M2_22:
.Lztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22
bgt .Lztrmm_kernel_L4_M2_22
ztrmm_kernel_L4_M2_40:
.Lztrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100
ble .Lztrmm_kernel_L4_M2_100
ztrmm_kernel_L4_M2_42:
.Lztrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42
bgt .Lztrmm_kernel_L4_M2_42
ztrmm_kernel_L4_M2_100:
.Lztrmm_kernel_L4_M2_100:
SAVE2x4
@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ztrmm_kernel_L4_M2_END:
.Lztrmm_kernel_L4_M2_END:
ztrmm_kernel_L4_M1_BEGIN:
.Lztrmm_kernel_L4_M1_BEGIN:
// 1-wide M tail of the 4-wide N panel.
tst counterI, #1 // test bit 0 of counterI; counterI itself is not modified
// Bit 0 clear: no single leftover in M, finish this N panel.
// (After tst with a small positive mask, ble is taken only on a zero result.)
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END
ztrmm_kernel_L4_M1_20:
.Lztrmm_kernel_L4_M1_20:
// INIT1x4 is a macro defined elsewhere; presumably it clears the accumulators
// for a 1x4 tile (TODO confirm).
INIT1x4
@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L4_M1_40
ble .Lztrmm_kernel_L4_M1_40
ztrmm_kernel_L4_M1_22:
.Lztrmm_kernel_L4_M1_22:
// Main K loop body: back-to-back KERNEL1x4_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22
bgt .Lztrmm_kernel_L4_M1_22
ztrmm_kernel_L4_M1_40:
.Lztrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L4_M1_100
ble .Lztrmm_kernel_L4_M1_100
ztrmm_kernel_L4_M1_42:
.Lztrmm_kernel_L4_M1_42:
// Remainder loop: one KERNEL1x4_SUB per iteration until counterL reaches 0.
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42
bgt .Lztrmm_kernel_L4_M1_42
ztrmm_kernel_L4_M1_100:
.Lztrmm_kernel_L4_M1_100:
// SAVE1x4 is a macro defined elsewhere; presumably it stores the 1x4 tile to C
// (TODO confirm).
SAVE1x4
@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif
ztrmm_kernel_L4_END:
.Lztrmm_kernel_L4_END:
// End of one 4-wide N panel: advance origPB past the panel just consumed.
lsl temp, origK, #6 // temp = origK * 64
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
// Next 4-wide N panel while counterJ > 0.
bgt ztrmm_kernel_L4_BEGIN
bgt .Lztrmm_kernel_L4_BEGIN
/******************************************************************************/
// N tail. The low two bits of origN decide what is left after the 4-wide
// panels: bit 1 selects a 2-wide panel (this section), bit 0 a 1-wide panel.
// tst only sets flags; with these small positive masks "ble" is taken exactly
// when the tested bits are all zero.
ztrmm_kernel_L2_BEGIN: // at most 3 left in N direction (origN & 3)
.Lztrmm_kernel_L2_BEGIN: // at most 3 left in N direction (origN & 3)
mov counterJ , origN
tst counterJ , #3
ble ztrmm_kernel_L999 // origN & 3 == 0: nothing left in N, go to the exit path
ble .Lztrmm_kernel_L999 // origN & 3 == 0: nothing left in N, go to the exit path
tst counterJ , #2
// Bit 1 of origN clear: no 2-wide panel, continue with the 1-wide case.
ble ztrmm_kernel_L1_BEGIN
ble .Lztrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
ztrmm_kernel_L2_M4_BEGIN:
.Lztrmm_kernel_L2_M4_BEGIN:
// 4-wide M loop of the 2-wide N panel: counterI counts the 4-wide tiles.
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
// origM < 4: no full 4-wide tile, go to the M tail.
ble ztrmm_kernel_L2_M2_BEGIN
ble .Lztrmm_kernel_L2_M2_BEGIN
ztrmm_kernel_L2_M4_20:
.Lztrmm_kernel_L2_M4_20:
// INIT4x2 is a macro defined elsewhere; presumably it clears the accumulators
// for a 4x2 tile (TODO confirm).
INIT4x2
@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL,#0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L2_M4_40
ble .Lztrmm_kernel_L2_M4_40
// Align the loop head to a 2^5 = 32-byte boundary.
.align 5
ztrmm_kernel_L2_M4_22:
.Lztrmm_kernel_L2_M4_22:
// Main K loop body: back-to-back KERNEL4x2_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22
bgt .Lztrmm_kernel_L2_M4_22
ztrmm_kernel_L2_M4_40:
.Lztrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L2_M4_100
ble .Lztrmm_kernel_L2_M4_100
ztrmm_kernel_L2_M4_42:
.Lztrmm_kernel_L2_M4_42:
// Remainder loop: one KERNEL4x2_SUB per iteration until counterL reaches 0.
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42
bgt .Lztrmm_kernel_L2_M4_42
ztrmm_kernel_L2_M4_100:
.Lztrmm_kernel_L2_M4_100:
// SAVE4x2 is a macro defined elsewhere; presumably it stores the 4x2 tile to C
// (TODO confirm).
SAVE4x2
@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
// tempOffset += 4 (conditionally assembled; closes with the #endif below).
add tempOffset, tempOffset, #4
#endif
ztrmm_kernel_L2_M4_END:
.Lztrmm_kernel_L2_M4_END:
// counterI--; next 4-wide tile while counterI > 0.
subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20
bgt .Lztrmm_kernel_L2_M4_20
ztrmm_kernel_L2_M2_BEGIN:
.Lztrmm_kernel_L2_M2_BEGIN:
// M tail of the 2-wide N panel: reload origM and inspect its two low bits.
// tst only sets flags (Z from the AND result, V cleared), so with these small
// positive masks every "ble" below is taken exactly when the tested bits are 0.
mov counterI, origM
tst counterI , #3
// origM & 3 == 0: nothing left in M, finish this N panel.
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END
tst counterI, #2 // test bit 1 of origM; counterI itself is not modified
// Bit 1 clear: no 2-wide tile needed, try the 1-wide case.
ble ztrmm_kernel_L2_M1_BEGIN
ble .Lztrmm_kernel_L2_M1_BEGIN
ztrmm_kernel_L2_M2_20:
.Lztrmm_kernel_L2_M2_20:
// INIT2x2 is a macro defined elsewhere; presumably it clears the accumulators
// for a 2x2 tile (TODO confirm).
INIT2x2
@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL,#0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L2_M2_40
ble .Lztrmm_kernel_L2_M2_40
ztrmm_kernel_L2_M2_22:
.Lztrmm_kernel_L2_M2_22:
// Main K loop body: back-to-back KERNEL2x2_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22
bgt .Lztrmm_kernel_L2_M2_22
ztrmm_kernel_L2_M2_40:
.Lztrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L2_M2_100
ble .Lztrmm_kernel_L2_M2_100
ztrmm_kernel_L2_M2_42:
.Lztrmm_kernel_L2_M2_42:
// Remainder loop: one KERNEL2x2_SUB per iteration until counterL reaches 0.
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42
bgt .Lztrmm_kernel_L2_M2_42
ztrmm_kernel_L2_M2_100:
.Lztrmm_kernel_L2_M2_100:
// SAVE2x2 is a macro defined elsewhere; presumably it stores the 2x2 tile to C
// (TODO confirm).
SAVE2x2
@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
// tempOffset += 2 (conditionally assembled; closes with the #endif below).
add tempOffset, tempOffset, #2
#endif
// Empty label: execution falls through into the 1-wide M case.
ztrmm_kernel_L2_M2_END:
.Lztrmm_kernel_L2_M2_END:
ztrmm_kernel_L2_M1_BEGIN:
.Lztrmm_kernel_L2_M1_BEGIN:
// 1-wide M tail of the 2-wide N panel.
tst counterI, #1 // test bit 0 of counterI; counterI itself is not modified
// Bit 0 clear: no single leftover in M, finish this N panel.
// (After tst with a small positive mask, ble is taken only on a zero result.)
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END
ztrmm_kernel_L2_M1_20:
.Lztrmm_kernel_L2_M1_20:
// INIT1x2 is a macro defined elsewhere; presumably it clears the accumulators
// for a 1x2 tile (TODO confirm).
INIT1x2
@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL, #0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L2_M1_40
ble .Lztrmm_kernel_L2_M1_40
ztrmm_kernel_L2_M1_22:
.Lztrmm_kernel_L2_M1_22:
// Main K loop body: back-to-back KERNEL1x2_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22
bgt .Lztrmm_kernel_L2_M1_22
ztrmm_kernel_L2_M1_40:
.Lztrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L2_M1_100
ble .Lztrmm_kernel_L2_M1_100
ztrmm_kernel_L2_M1_42:
.Lztrmm_kernel_L2_M1_42:
// Remainder loop: one KERNEL1x2_SUB per iteration until counterL reaches 0.
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42
bgt .Lztrmm_kernel_L2_M1_42
ztrmm_kernel_L2_M1_100:
.Lztrmm_kernel_L2_M1_100:
// SAVE1x2 is a macro defined elsewhere; presumably it stores the 1x2 tile to C
// (TODO confirm).
SAVE1x2
@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif
ztrmm_kernel_L2_END:
.Lztrmm_kernel_L2_END:
// End of the 2-wide N panel: when LEFT is not defined, tempOffset advances by 2.
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:
/******************************************************************************/
// Last N case: a single leftover in the N direction, selected by bit 0 of origN.
ztrmm_kernel_L1_BEGIN:
.Lztrmm_kernel_L1_BEGIN:
mov counterJ , origN
// tst only sets flags; counterJ keeps the value of origN.
tst counterJ , #1
ble ztrmm_kernel_L999 // bit 0 of origN clear: done
ble .Lztrmm_kernel_L999 // bit 0 of origN clear: done
mov pCRow0, pC // pCRow0 = C
@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:
ztrmm_kernel_L1_M4_BEGIN:
.Lztrmm_kernel_L1_M4_BEGIN:
// 4-wide M loop of the 1-wide N panel: counterI counts the 4-wide tiles.
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
// origM < 4: no full 4-wide tile, go to the M tail.
ble ztrmm_kernel_L1_M2_BEGIN
ble .Lztrmm_kernel_L1_M2_BEGIN
ztrmm_kernel_L1_M4_20:
.Lztrmm_kernel_L1_M4_20:
// INIT4x1 is a macro defined elsewhere; presumably it clears the accumulators
// for a 4x1 tile (TODO confirm).
INIT4x1
@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L1_M4_40
ble .Lztrmm_kernel_L1_M4_40
// Align the loop head to a 2^5 = 32-byte boundary.
.align 5
ztrmm_kernel_L1_M4_22:
.Lztrmm_kernel_L1_M4_22:
// Main K loop body: back-to-back KERNEL4x1_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22
bgt .Lztrmm_kernel_L1_M4_22
ztrmm_kernel_L1_M4_40:
.Lztrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L1_M4_100
ble .Lztrmm_kernel_L1_M4_100
ztrmm_kernel_L1_M4_42:
.Lztrmm_kernel_L1_M4_42:
// Remainder loop: one KERNEL4x1_SUB per iteration until counterL reaches 0.
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42
bgt .Lztrmm_kernel_L1_M4_42
ztrmm_kernel_L1_M4_100:
.Lztrmm_kernel_L1_M4_100:
// SAVE4x1 is a macro defined elsewhere; presumably it stores the 4x1 tile to C
// (TODO confirm).
SAVE4x1
@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
// tempOffset += 4 (conditionally assembled; closes with the #endif below).
add tempOffset, tempOffset, #4
#endif
ztrmm_kernel_L1_M4_END:
.Lztrmm_kernel_L1_M4_END:
// counterI--; next 4-wide tile while counterI > 0.
subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20
bgt .Lztrmm_kernel_L1_M4_20
ztrmm_kernel_L1_M2_BEGIN:
.Lztrmm_kernel_L1_M2_BEGIN:
// M tail of the 1-wide N panel: reload origM and inspect its two low bits.
// tst only sets flags (Z from the AND result, V cleared), so with these small
// positive masks every "ble" below is taken exactly when the tested bits are 0.
mov counterI, origM
tst counterI , #3
// origM & 3 == 0: nothing left in M, finish this N panel.
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END
tst counterI, #2 // test bit 1 of origM; counterI itself is not modified
// Bit 1 clear: no 2-wide tile needed, try the 1-wide case.
ble ztrmm_kernel_L1_M1_BEGIN
ble .Lztrmm_kernel_L1_M1_BEGIN
ztrmm_kernel_L1_M2_20:
.Lztrmm_kernel_L1_M2_20:
// INIT2x1 is a macro defined elsewhere; presumably it clears the accumulators
// for a 2x1 tile (TODO confirm).
INIT2x1
@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L1_M2_40
ble .Lztrmm_kernel_L1_M2_40
ztrmm_kernel_L1_M2_22:
.Lztrmm_kernel_L1_M2_22:
// Main K loop body: back-to-back KERNEL2x1_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22
bgt .Lztrmm_kernel_L1_M2_22
ztrmm_kernel_L1_M2_40:
.Lztrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L1_M2_100
ble .Lztrmm_kernel_L1_M2_100
ztrmm_kernel_L1_M2_42:
.Lztrmm_kernel_L1_M2_42:
// Remainder loop: one KERNEL2x1_SUB per iteration until counterL reaches 0.
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42
bgt .Lztrmm_kernel_L1_M2_42
ztrmm_kernel_L1_M2_100:
.Lztrmm_kernel_L1_M2_100:
// SAVE2x1 is a macro defined elsewhere; presumably it stores the 2x1 tile to C
// (TODO confirm).
SAVE2x1
@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
// tempOffset += 2 (conditionally assembled; closes with the #endif below).
add tempOffset, tempOffset, #2
#endif
// Empty label: execution falls through into the 1-wide M case.
ztrmm_kernel_L1_M2_END:
.Lztrmm_kernel_L1_M2_END:
ztrmm_kernel_L1_M1_BEGIN:
.Lztrmm_kernel_L1_M1_BEGIN:
// 1-wide M tail of the 1-wide N panel.
tst counterI, #1 // test bit 0 of counterI; counterI itself is not modified
// Bit 0 clear: no single leftover in M, finish this N panel.
// (After tst with a small positive mask, ble is taken only on a zero result.)
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END
ztrmm_kernel_L1_M1_20:
.Lztrmm_kernel_L1_M1_20:
// INIT1x1 is a macro defined elsewhere; presumably it clears the accumulators
// for a 1x1 tile (TODO confirm).
INIT1x1
@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0
// Fewer than 8 K steps: skip the main loop and go to the remainder.
ble ztrmm_kernel_L1_M1_40
ble .Lztrmm_kernel_L1_M1_40
ztrmm_kernel_L1_M1_22:
.Lztrmm_kernel_L1_M1_22:
// Main K loop body: back-to-back KERNEL1x1_SUB invocations, presumably 8 per
// pass to match counterL = tempK / 8 (TODO confirm the count).
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
// counterL--; repeat while counterL > 0.
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22
bgt .Lztrmm_kernel_L1_M1_22
ztrmm_kernel_L1_M1_40:
.Lztrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = tempK & 7 (K steps left over after the /8 loop)
// No leftover K steps: go straight to the store.
ble ztrmm_kernel_L1_M1_100
ble .Lztrmm_kernel_L1_M1_100
ztrmm_kernel_L1_M1_42:
.Lztrmm_kernel_L1_M1_42:
// Remainder loop: one KERNEL1x1_SUB per iteration until counterL reaches 0.
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42
bgt .Lztrmm_kernel_L1_M1_42
ztrmm_kernel_L1_M1_100:
.Lztrmm_kernel_L1_M1_100:
// SAVE1x1 is a macro defined elsewhere; presumably it stores the 1x1 tile to C
// (TODO confirm).
SAVE1x1
// Empty label: execution falls through to the exit path that follows.
ztrmm_kernel_L1_END:
.Lztrmm_kernel_L1_END:
ztrmm_kernel_L999:
.Lztrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]