ARM64: Convert all labels to local labels

While debugging or profiling applications with perf or other tools, the
kernels appear scattered in the profile reports. This is because the labels
within the kernels are not local, so each label is shown as a separate
function.

To avoid this, all the labels within the kernels are changed to local
labels (the `.L` prefix tells the assembler not to emit them into the
symbol table, so profilers attribute samples to the enclosing kernel).
This commit is contained in:
Ashwin Sekhar T K 2017-10-24 10:47:11 +00:00
parent 627133f9ad
commit a0128aa489
50 changed files with 4469 additions and 4469 deletions

View File

@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lamax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
.Lamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lamax_kernel_F1
amax_kernel_F4:
.Lamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
bne .Lamax_kernel_F4
amax_kernel_F1:
.Lamax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999
amax_kernel_F10:
.Lamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
bne .Lamax_kernel_F10
ret
amax_kernel_F1_INIT:
.Lamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lamax_kernel_F1
amax_kernel_S_BEGIN:
.Lamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
ble .Lamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lamax_kernel_S1
amax_kernel_S4:
.Lamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
bne .Lamax_kernel_S4
amax_kernel_S1:
.Lamax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999
amax_kernel_S10:
.Lamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
bne .Lamax_kernel_S10
amax_kernel_L999:
.Lamax_kernel_L999:
ret
amax_kernel_zero:
.Lamax_kernel_zero:
fmov MAXF, REG0
ret

View File

@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lasum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lasum_kernel_F1
asum_kernel_F8:
.Lasum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
bne .Lasum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
.Lasum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
ble .Lasum_kernel_L999
asum_kernel_F10:
.Lasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lasum_kernel_F10
asum_kernel_L999:
.Lasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lasum_kernel_S1
asum_kernel_S4:
.Lasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lasum_kernel_S4
asum_kernel_S1:
.Lasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lasum_kernel_L999
asum_kernel_S10:
.Lasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lasum_kernel_S10
ret

View File

@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
fcmp DA, #0.0
beq axpy_kernel_L999
beq .Laxpy_kernel_L999
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN:
.Laxpy_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1
beq .Laxpy_kernel_F1
axpy_kernel_F8:
.Laxpy_kernel_F8:
KERNEL_F8
subs I, I, #1
bne axpy_kernel_F8
bne .Laxpy_kernel_F8
axpy_kernel_F1:
.Laxpy_kernel_F1:
ands I, N, #7
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
axpy_kernel_F10:
.Laxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
bne .Laxpy_kernel_F10
mov w0, wzr
ret
axpy_kernel_S_BEGIN:
.Laxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Laxpy_kernel_S1
axpy_kernel_S4:
.Laxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
bne .Laxpy_kernel_S4
axpy_kernel_S1:
.Laxpy_kernel_S1:
ands I, N, #3
ble axpy_kernel_L999
ble .Laxpy_kernel_L999
axpy_kernel_S10:
.Laxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
bne .Laxpy_kernel_S10
axpy_kernel_L999:
.Laxpy_kernel_L999:
mov w0, wzr
ret

View File

@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF
cmp N, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lcasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lcasum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lcasum_kernel_F1
asum_kernel_F8:
.Lcasum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
bne .Lcasum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
.Lcasum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
ble .Lcasum_kernel_L999
asum_kernel_F10:
.Lcasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lcasum_kernel_F10
asum_kernel_L999:
.Lcasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lcasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lcasum_kernel_S1
asum_kernel_S4:
.Lcasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lcasum_kernel_S4
asum_kernel_S1:
.Lcasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lcasum_kernel_L999
asum_kernel_S10:
.Lcasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lcasum_kernel_S10
ret

View File

@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
add ppA, temp, pA
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_40:
.Lcgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_42:
.Lcgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42
bgt .Lcgemm_kernel_L4_M4_42
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20
bgt .Lcgemm_kernel_L2_M4_20
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20
bgt .Lcgemm_kernel_L1_M4_20
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5
cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:
INIT4x4
cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:
INIT8x2
@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5
cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:
SAVE8x2
cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5
cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:
SAVE8x1
cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #5 // origK / 32
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8
subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x16
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22
.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4
@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:
INIT8x4
cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100
.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5
cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:
INIT4x4
cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:
SAVE4x4
cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:
SAVE2x4
cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:
SAVE1x4
cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:
INIT8x2
@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5
cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:
SAVE8x2
cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20
cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5
cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:
SAVE4x2
cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:
cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:
SAVE1x2
cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5
cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:
SAVE8x1
cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20
cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5
cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:
SAVE4x1
cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:
cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:
SAVE2x1
cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:
SAVE1x1
cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:
cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble copy_kernel_L999
ble .Lcopy_kernel_L999
cmp INC_X, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
copy_kernel_F_BEGIN:
.Lcopy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq copy_kernel_F1
beq .Lcopy_kernel_F1
copy_kernel_F4:
.Lcopy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne copy_kernel_F4
bne .Lcopy_kernel_F4
copy_kernel_F1:
.Lcopy_kernel_F1:
ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999
copy_kernel_F10:
.Lcopy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne copy_kernel_F10
bne .Lcopy_kernel_F10
mov w0, wzr
ret
copy_kernel_S_BEGIN:
.Lcopy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble copy_kernel_S1
ble .Lcopy_kernel_S1
copy_kernel_S4:
.Lcopy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne copy_kernel_S4
bne .Lcopy_kernel_S4
copy_kernel_S1:
.Lcopy_kernel_S1:
ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999
copy_kernel_S10:
.Lcopy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne copy_kernel_S10
bne .Lcopy_kernel_S10
copy_kernel_L999:
.Lcopy_kernel_L999:
mov w0, wzr
ret

View File

@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN
ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5
ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
.Lctrmm_kernel_L4_M4_40:
INIT4x4
ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:
SAVE4x4
@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20
bne .Lctrmm_kernel_L4_M4_20
ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN
ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:
INIT2x4
@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:
SAVE2x4
@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:
INIT1x4
@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:
SAVE1x4
@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN
ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:
INIT4x2
@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5
ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:
SAVE4x2
@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20
bgt .Lctrmm_kernel_L2_M4_20
ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN
ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:
INIT2x2
@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:
SAVE2x2
@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:
INIT1x2
@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:
SAVE1x2
@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/
ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:
INIT4x1
@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5
ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:
SAVE4x1
@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20
bgt .Lctrmm_kernel_L1_M4_20
ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN
ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:
INIT2x1
@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:
SAVE2x1
@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:
INIT1x1
@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:
SAVE1x1
ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:
ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M8_BEGIN:
.Lctrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN
ble .Lctrmm_kernel_L4_M4_BEGIN
ctrmm_kernel_L4_M8_20:
.Lctrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:
asr counterL , tempK, #3
cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
blt .Lctrmm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
ble .Lctrmm_kernel_L4_M8_22a
.align 5
ctrmm_kernel_L4_M8_22:
.Lctrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
bgt .Lctrmm_kernel_L4_M8_22
.align 5
ctrmm_kernel_L4_M8_22a:
.Lctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44
.align 5
ctrmm_kernel_L4_M8_32:
.Lctrmm_kernel_L4_M8_32:
tst counterL, #1
ble ctrmm_kernel_L4_M8_40
ble .Lctrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44
ctrmm_kernel_L4_M8_40:
.Lctrmm_kernel_L4_M8_40:
INIT8x4
ctrmm_kernel_L4_M8_44:
.Lctrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
ble .Lctrmm_kernel_L4_M8_100
.align 5
ctrmm_kernel_L4_M8_46:
.Lctrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46
bne .Lctrmm_kernel_L4_M8_46
ctrmm_kernel_L4_M8_100:
.Lctrmm_kernel_L4_M8_100:
SAVE8x4
@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ctrmm_kernel_L4_M8_END:
.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20
bne .Lctrmm_kernel_L4_M8_20
ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN
ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5
ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_40:
INIT4x4
ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:
SAVE4x4
@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN
ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:
INIT2x4
@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:
SAVE2x4
@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:
INIT1x4
@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:
SAVE1x4
@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A
ctrmm_kernel_L2_M8_BEGIN:
.Lctrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN
ble .Lctrmm_kernel_L2_M4_BEGIN
ctrmm_kernel_L2_M8_20:
.Lctrmm_kernel_L2_M8_20:
INIT8x2
@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M8_40
ble .Lctrmm_kernel_L2_M8_40
.align 5
ctrmm_kernel_L2_M8_22:
.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22
bgt .Lctrmm_kernel_L2_M8_22
ctrmm_kernel_L2_M8_40:
.Lctrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100
ble .Lctrmm_kernel_L2_M8_100
ctrmm_kernel_L2_M8_42:
.Lctrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42
bgt .Lctrmm_kernel_L2_M8_42
ctrmm_kernel_L2_M8_100:
.Lctrmm_kernel_L2_M8_100:
SAVE8x2
@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif
ctrmm_kernel_L2_M8_END:
.Lctrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20
bgt .Lctrmm_kernel_L2_M8_20
ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN
ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:
INIT4x2
@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5
ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:
SAVE4x2
@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:
ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN
ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:
INIT2x2
@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:
SAVE2x2
@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:
INIT1x2
@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:
SAVE1x2
@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/
ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
ctrmm_kernel_L1_M8_BEGIN:
.Lctrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN
ble .Lctrmm_kernel_L1_M4_BEGIN
ctrmm_kernel_L1_M8_20:
.Lctrmm_kernel_L1_M8_20:
INIT8x1
@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M8_40
ble .Lctrmm_kernel_L1_M8_40
.align 5
ctrmm_kernel_L1_M8_22:
.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22
bgt .Lctrmm_kernel_L1_M8_22
ctrmm_kernel_L1_M8_40:
.Lctrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100
ble .Lctrmm_kernel_L1_M8_100
ctrmm_kernel_L1_M8_42:
.Lctrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42
bgt .Lctrmm_kernel_L1_M8_42
ctrmm_kernel_L1_M8_100:
.Lctrmm_kernel_L1_M8_100:
SAVE8x1
@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif
ctrmm_kernel_L1_M8_END:
.Lctrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20
bgt .Lctrmm_kernel_L1_M8_20
ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:
INIT4x1
@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5
ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:
SAVE4x1
@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:
ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN
ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:
INIT2x1
@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:
SAVE2x1
@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:
INIT1x1
@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:
SAVE1x1
ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:
ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999
fcmp DA, #0.0
beq axpy_kernel_L999
beq .Ldaxpy_kernel_L999
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN:
.Ldaxpy_kernel_F_BEGIN:
asr I, N, #5
cmp I, xzr
beq axpy_kernel_F1
beq .Ldaxpy_kernel_F1
.align 5
axpy_kernel_F32:
.Ldaxpy_kernel_F32:
KERNEL_F32
subs I, I, #1
bne axpy_kernel_F32
bne .Ldaxpy_kernel_F32
axpy_kernel_F1:
.Ldaxpy_kernel_F1:
ands I, N, #31
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999
axpy_kernel_F10:
.Ldaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
bne .Ldaxpy_kernel_F10
b axpy_kernel_L999
b .Ldaxpy_kernel_L999
axpy_kernel_S_BEGIN:
.Ldaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Ldaxpy_kernel_S1
axpy_kernel_S4:
.Ldaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
bne .Ldaxpy_kernel_S4
axpy_kernel_S1:
.Ldaxpy_kernel_S1:
ands I, N, #3
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999
axpy_kernel_S10:
.Ldaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
bne .Ldaxpy_kernel_S10
axpy_kernel_L999:
.Ldaxpy_kernel_L999:
mov w0, wzr
ret

View File

@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
//------------------------------------------------------------------------------
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #2 // L = K / 4
cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp]
@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L8_BEGIN:
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
mov pA, origPA // pA = start of A array
dgemm_kernel_L8_M4_BEGIN:
.Ldgemm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN
ble .Ldgemm_kernel_L8_M2_BEGIN
dgemm_kernel_L8_M4_20:
.Ldgemm_kernel_L8_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32
blt .Ldgemm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a
ble .Ldgemm_kernel_L8_M4_22a
.align 5
dgemm_kernel_L8_M4_22:
.Ldgemm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22
bgt .Ldgemm_kernel_L8_M4_22
dgemm_kernel_L8_M4_22a:
.Ldgemm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_32:
.Ldgemm_kernel_L8_M4_32:
tst counterL, #1
ble dgemm_kernel_L8_M4_40
ble .Ldgemm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_40:
.Ldgemm_kernel_L8_M4_40:
INIT4x8
dgemm_kernel_L8_M4_44:
.Ldgemm_kernel_L8_M4_44:
ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100
ble .Ldgemm_kernel_L8_M4_100
dgemm_kernel_L8_M4_46:
.Ldgemm_kernel_L8_M4_46:
KERNEL4x8_SUB
dgemm_kernel_L8_M4_100:
.Ldgemm_kernel_L8_M4_100:
SAVE4x8
dgemm_kernel_L8_M4_END:
.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20
bne .Ldgemm_kernel_L8_M4_20
dgemm_kernel_L8_M2_BEGIN:
.Ldgemm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L8_M1_BEGIN
ble .Ldgemm_kernel_L8_M1_BEGIN
dgemm_kernel_L8_M2_20:
.Ldgemm_kernel_L8_M2_20:
INIT2x8
@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M2_40
ble .Ldgemm_kernel_L8_M2_40
dgemm_kernel_L8_M2_22:
.Ldgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22
bgt .Ldgemm_kernel_L8_M2_22
dgemm_kernel_L8_M2_40:
.Ldgemm_kernel_L8_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M2_100
ble .Ldgemm_kernel_L8_M2_100
dgemm_kernel_L8_M2_42:
.Ldgemm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42
bgt .Ldgemm_kernel_L8_M2_42
dgemm_kernel_L8_M2_100:
.Ldgemm_kernel_L8_M2_100:
SAVE2x8
dgemm_kernel_L8_M2_END:
.Ldgemm_kernel_L8_M2_END:
dgemm_kernel_L8_M1_BEGIN:
.Ldgemm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END
dgemm_kernel_L8_M1_20:
.Ldgemm_kernel_L8_M1_20:
INIT1x8
@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M1_40
ble .Ldgemm_kernel_L8_M1_40
dgemm_kernel_L8_M1_22:
.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22
bgt .Ldgemm_kernel_L8_M1_22
dgemm_kernel_L8_M1_40:
.Ldgemm_kernel_L8_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M1_100
ble .Ldgemm_kernel_L8_M1_100
dgemm_kernel_L8_M1_42:
.Ldgemm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42
bgt .Ldgemm_kernel_L8_M1_42
dgemm_kernel_L8_M1_100:
.Ldgemm_kernel_L8_M1_100:
SAVE1x8
dgemm_kernel_L8_END:
.Ldgemm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L8_BEGIN
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
ble dgemm_kernel_L999
ble .Ldgemm_kernel_L999
tst counterJ , #4
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M4_32
blt .Ldgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a
ble .Ldgemm_kernel_L4_M4_22a
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_22a:
.Ldgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_32:
.Ldgemm_kernel_L4_M4_32:
tst counterL, #1
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
INIT4x4
dgemm_kernel_L4_M4_44:
.Ldgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_46:
.Ldgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20
bne .Ldgemm_kernel_L4_M4_20
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:
INIT8x2
@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40
.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:
SAVE8x2
dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40
.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:
SAVE8x1
dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x64
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16
@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB
@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:
INIT8x2
@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40
.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:
SAVE8x2
dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB
@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40
.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:
SAVE8x1
dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB
@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4
cmp J, #0
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN
.align 5
dgemm_ncopy_L4_M4_BEGIN:
.Ldgemm_ncopy_L4_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L4_M4_40
ble .Ldgemm_ncopy_L4_M4_40
.align 5
dgemm_ncopy_L4_M4_20:
.Ldgemm_ncopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_20
bne .Ldgemm_ncopy_L4_M4_20
dgemm_ncopy_L4_M4_40:
.Ldgemm_ncopy_L4_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L4_M4_END
ble .Ldgemm_ncopy_L4_M4_END
.align 5
dgemm_ncopy_L4_M4_60:
.Ldgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_60
bne .Ldgemm_ncopy_L4_M4_60
dgemm_ncopy_L4_M4_END:
.Ldgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
bne .Ldgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M4_BEGIN:
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L2_M4_40
ble .Ldgemm_ncopy_L2_M4_40
.align 5
dgemm_ncopy_L2_M4_20:
.Ldgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_20
bne .Ldgemm_ncopy_L2_M4_20
dgemm_ncopy_L2_M4_40:
.Ldgemm_ncopy_L2_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L2_M4_END
ble .Ldgemm_ncopy_L2_M4_END
.align 5
dgemm_ncopy_L2_M4_60:
.Ldgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_60
bne .Ldgemm_ncopy_L2_M4_60
dgemm_ncopy_L2_M4_END:
.Ldgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M4_BEGIN:
.Ldgemm_ncopy_L1_M4_BEGIN:
mov A01, A00
asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L1_M4_40
ble .Ldgemm_ncopy_L1_M4_40
.align 5
dgemm_ncopy_L1_M4_20:
.Ldgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_20
bne .Ldgemm_ncopy_L1_M4_20
dgemm_ncopy_L1_M4_40:
.Ldgemm_ncopy_L1_M4_40:
and I, M , #3
cmp I, #0
ble dgemm_ncopy_L1_M4_END
ble .Ldgemm_ncopy_L1_M4_END
.align 5
dgemm_ncopy_L1_M4_60:
.Ldgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_60
bne .Ldgemm_ncopy_L1_M4_60
dgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L1_M4_END:
dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS

View File

@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L8_BEGIN:
.Ldgemm_ncopy_L8_BEGIN:
asr J, N, #3 // J = N / 8
cmp J, #0
ble dgemm_ncopy_L4_BEGIN
ble .Ldgemm_ncopy_L4_BEGIN
dgemm_ncopy_L8_M8_BEGIN:
.Ldgemm_ncopy_L8_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L8_M8_40
ble .Ldgemm_ncopy_L8_M8_40
dgemm_ncopy_L8_M8_20:
.Ldgemm_ncopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_20
bne .Ldgemm_ncopy_L8_M8_20
dgemm_ncopy_L8_M8_40:
.Ldgemm_ncopy_L8_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L8_M8_END
ble .Ldgemm_ncopy_L8_M8_END
dgemm_ncopy_L8_M8_60:
.Ldgemm_ncopy_L8_M8_60:
COPY1x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_60
bne .Ldgemm_ncopy_L8_M8_60
dgemm_ncopy_L8_M8_END:
.Ldgemm_ncopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
bne .Ldgemm_ncopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:
tst N, #7
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #4
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN
dgemm_ncopy_L4_M8_BEGIN:
.Ldgemm_ncopy_L4_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L4_M8_40
ble .Ldgemm_ncopy_L4_M8_40
dgemm_ncopy_L4_M8_20:
.Ldgemm_ncopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_20
bne .Ldgemm_ncopy_L4_M8_20
dgemm_ncopy_L4_M8_40:
.Ldgemm_ncopy_L4_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L4_M8_END
ble .Ldgemm_ncopy_L4_M8_END
dgemm_ncopy_L4_M8_60:
.Ldgemm_ncopy_L4_M8_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_60
bne .Ldgemm_ncopy_L4_M8_60
dgemm_ncopy_L4_M8_END:
.Ldgemm_ncopy_L4_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M8_BEGIN:
.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L2_M8_40
ble .Ldgemm_ncopy_L2_M8_40
dgemm_ncopy_L2_M8_20:
.Ldgemm_ncopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_20
bne .Ldgemm_ncopy_L2_M8_20
dgemm_ncopy_L2_M8_40:
.Ldgemm_ncopy_L2_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L2_M8_END
ble .Ldgemm_ncopy_L2_M8_END
dgemm_ncopy_L2_M8_60:
.Ldgemm_ncopy_L2_M8_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_60
bne .Ldgemm_ncopy_L2_M8_60
dgemm_ncopy_L2_M8_END:
.Ldgemm_ncopy_L2_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M8_BEGIN:
.Ldgemm_ncopy_L1_M8_BEGIN:
mov A01, A00
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L1_M8_40
ble .Ldgemm_ncopy_L1_M8_40
dgemm_ncopy_L1_M8_20:
.Ldgemm_ncopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_20
bne .Ldgemm_ncopy_L1_M8_20
dgemm_ncopy_L1_M8_40:
.Ldgemm_ncopy_L1_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L1_M8_END
ble .Ldgemm_ncopy_L1_M8_END
dgemm_ncopy_L1_M8_60:
.Ldgemm_ncopy_L1_M8_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_60
bne .Ldgemm_ncopy_L1_M8_60
dgemm_ncopy_L1_M8_END:
.Ldgemm_ncopy_L1_M8_END:
dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS

View File

@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M4, M, #5 // M4 = M * 4 * SIZE
dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN
.align 5
dgemm_tcopy_L4_M4_BEGIN:
.Ldgemm_tcopy_L4_M4_BEGIN:
mov A01, A
add A02, A01, LDA
@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L4_M4_40
ble .Ldgemm_tcopy_L4_M4_40
.align 5
dgemm_tcopy_L4_M4_20:
.Ldgemm_tcopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_tcopy_L4_M4_20
bne .Ldgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40:
.Ldgemm_tcopy_L4_M4_40:
tst N , #2
ble dgemm_tcopy_L4_M4_60
ble .Ldgemm_tcopy_L4_M4_60
COPY2x4
dgemm_tcopy_L4_M4_60:
.Ldgemm_tcopy_L4_M4_60:
tst N, #1
ble dgemm_tcopy_L4_M4_END
ble .Ldgemm_tcopy_L4_M4_END
COPY1x4
dgemm_tcopy_L4_M4_END:
.Ldgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
bne .Ldgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M4_BEGIN:
.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L2_M4_40
ble .Ldgemm_tcopy_L2_M4_40
.align 5
dgemm_tcopy_L2_M4_20:
.Ldgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_tcopy_L2_M4_20
bne .Ldgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40:
.Ldgemm_tcopy_L2_M4_40:
tst N , #2
ble dgemm_tcopy_L2_M4_60
ble .Ldgemm_tcopy_L2_M4_60
COPY2x2
dgemm_tcopy_L2_M4_60:
.Ldgemm_tcopy_L2_M4_60:
tst N , #1
ble dgemm_tcopy_L2_M4_END
ble .Ldgemm_tcopy_L2_M4_END
COPY1x2
dgemm_tcopy_L2_M4_END:
.Ldgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M4_BEGIN:
.Ldgemm_tcopy_L1_M4_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #2 // I = M / 4
cmp I, #0
ble dgemm_tcopy_L1_M4_40
ble .Ldgemm_tcopy_L1_M4_40
.align 5
dgemm_tcopy_L1_M4_20:
.Ldgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_tcopy_L1_M4_20
bne .Ldgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40:
.Ldgemm_tcopy_L1_M4_40:
tst N , #2
ble dgemm_tcopy_L1_M4_60
ble .Ldgemm_tcopy_L1_M4_60
COPY2x1
dgemm_tcopy_L1_M4_60:
.Ldgemm_tcopy_L1_M4_60:
tst N , #1
ble dgemm_tcopy_L1_M4_END
ble .Ldgemm_tcopy_L1_M4_END
COPY1x1
dgemm_tcopy_L1_M4_END:
.Ldgemm_tcopy_L1_M4_END:
dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret

View File

@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M8, M, #6 // M8 = M * 8 * SIZE
dgemm_tcopy_L8_BEGIN:
.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L4_BEGIN
ble .Ldgemm_tcopy_L4_BEGIN
.align 5
dgemm_tcopy_L8_M8_BEGIN:
.Ldgemm_tcopy_L8_M8_BEGIN:
mov A01, A
add A02, A01, LDA
@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L8_M8_40
ble .Ldgemm_tcopy_L8_M8_40
.align 5
dgemm_tcopy_L8_M8_20:
.Ldgemm_tcopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_tcopy_L8_M8_20
bne .Ldgemm_tcopy_L8_M8_20
dgemm_tcopy_L8_M8_40:
.Ldgemm_tcopy_L8_M8_40:
tst N , #4
ble dgemm_tcopy_L8_M8_60
ble .Ldgemm_tcopy_L8_M8_60
COPY4x8
dgemm_tcopy_L8_M8_60:
.Ldgemm_tcopy_L8_M8_60:
tst N , #2
ble dgemm_tcopy_L8_M8_80
ble .Ldgemm_tcopy_L8_M8_80
COPY2x8
dgemm_tcopy_L8_M8_80:
.Ldgemm_tcopy_L8_M8_80:
tst N, #1
ble dgemm_tcopy_L8_M8_END
ble .Ldgemm_tcopy_L8_M8_END
COPY1x8
dgemm_tcopy_L8_M8_END:
.Ldgemm_tcopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
bne .Ldgemm_tcopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
tst M, #7
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #4
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN
dgemm_tcopy_L4_M8_BEGIN:
.Ldgemm_tcopy_L4_M8_BEGIN:
mov A01, A
add A02, A01, LDA
@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L4_M8_40
ble .Ldgemm_tcopy_L4_M8_40
.align 5
dgemm_tcopy_L4_M8_20:
.Ldgemm_tcopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_tcopy_L4_M8_20
bne .Ldgemm_tcopy_L4_M8_20
dgemm_tcopy_L4_M8_40:
.Ldgemm_tcopy_L4_M8_40:
tst N , #4
ble dgemm_tcopy_L4_M8_60
ble .Ldgemm_tcopy_L4_M8_60
COPY4x4
dgemm_tcopy_L4_M8_60:
.Ldgemm_tcopy_L4_M8_60:
tst N , #2
ble dgemm_tcopy_L4_M8_80
ble .Ldgemm_tcopy_L4_M8_80
COPY2x4
dgemm_tcopy_L4_M8_80:
.Ldgemm_tcopy_L4_M8_80:
tst N, #1
ble dgemm_tcopy_L4_M8_END
ble .Ldgemm_tcopy_L4_M8_END
COPY1x4
dgemm_tcopy_L4_M8_END:
.Ldgemm_tcopy_L4_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M8_BEGIN:
.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L2_M8_40
ble .Ldgemm_tcopy_L2_M8_40
.align 5
dgemm_tcopy_L2_M8_20:
.Ldgemm_tcopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_tcopy_L2_M8_20
bne .Ldgemm_tcopy_L2_M8_20
dgemm_tcopy_L2_M8_40:
.Ldgemm_tcopy_L2_M8_40:
tst N , #4
ble dgemm_tcopy_L2_M8_60
ble .Ldgemm_tcopy_L2_M8_60
COPY4x2
dgemm_tcopy_L2_M8_60:
.Ldgemm_tcopy_L2_M8_60:
tst N , #2
ble dgemm_tcopy_L2_M8_80
ble .Ldgemm_tcopy_L2_M8_80
COPY2x2
dgemm_tcopy_L2_M8_80:
.Ldgemm_tcopy_L2_M8_80:
tst N , #1
ble dgemm_tcopy_L2_M8_END
ble .Ldgemm_tcopy_L2_M8_END
COPY1x2
dgemm_tcopy_L2_M8_END:
.Ldgemm_tcopy_L2_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M8_BEGIN:
.Ldgemm_tcopy_L1_M8_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #3 // I = M / 8
cmp I, #0
ble dgemm_tcopy_L1_M8_40
ble .Ldgemm_tcopy_L1_M8_40
.align 5
dgemm_tcopy_L1_M8_20:
.Ldgemm_tcopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_tcopy_L1_M8_20
bne .Ldgemm_tcopy_L1_M8_20
dgemm_tcopy_L1_M8_40:
.Ldgemm_tcopy_L1_M8_40:
tst N , #4
ble dgemm_tcopy_L1_M8_60
ble .Ldgemm_tcopy_L1_M8_60
COPY4x1
dgemm_tcopy_L1_M8_60:
.Ldgemm_tcopy_L1_M8_60:
tst N , #2
ble dgemm_tcopy_L1_M8_80
ble .Ldgemm_tcopy_L1_M8_80
COPY2x1
dgemm_tcopy_L1_M8_80:
.Ldgemm_tcopy_L1_M8_80:
tst N , #1
ble dgemm_tcopy_L1_M8_END
ble .Ldgemm_tcopy_L1_M8_END
COPY1x1
dgemm_tcopy_L1_M8_END:
.Ldgemm_tcopy_L1_M8_END:
dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret

View File

@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble dot_kernel_L999
ble .Ldot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
.Ldot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Ldot_kernel_F1
dot_kernel_F4:
.Ldot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
bne .Ldot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
.Ldot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999
dot_kernel_F10:
.Ldot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
bne .Ldot_kernel_F10
ret
dot_kernel_S_BEGIN:
.Ldot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Ldot_kernel_S1
dot_kernel_S4:
.Ldot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
bne .Ldot_kernel_S4
dot_kernel_S1:
.Ldot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999
dot_kernel_S10:
.Ldot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
bne .Ldot_kernel_S10
dot_kernel_L999:
.Ldot_kernel_L999:
ret

View File

@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:
tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
INIT4x4
dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN
ble .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L8_BEGIN:
.Ldtrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:
mov pA, origPA // pA = start of A array
dtrmm_kernel_L8_M4_BEGIN:
.Ldtrmm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L8_M2_BEGIN
ble .Ldtrmm_kernel_L8_M2_BEGIN
dtrmm_kernel_L8_M4_20:
.Ldtrmm_kernel_L8_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32
blt .Ldtrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a
ble .Ldtrmm_kernel_L8_M4_22a
.align 5
dtrmm_kernel_L8_M4_22:
.Ldtrmm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M4_22
bgt .Ldtrmm_kernel_L8_M4_22
dtrmm_kernel_L8_M4_22a:
.Ldtrmm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_32:
.Ldtrmm_kernel_L8_M4_32:
tst counterL, #1
ble dtrmm_kernel_L8_M4_40
ble .Ldtrmm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_40:
.Ldtrmm_kernel_L8_M4_40:
INIT4x8
dtrmm_kernel_L8_M4_44:
.Ldtrmm_kernel_L8_M4_44:
ands counterL, tempK, #1
ble dtrmm_kernel_L8_M4_100
ble .Ldtrmm_kernel_L8_M4_100
dtrmm_kernel_L8_M4_46:
.Ldtrmm_kernel_L8_M4_46:
KERNEL4x8_SUB
dtrmm_kernel_L8_M4_100:
.Ldtrmm_kernel_L8_M4_100:
SAVE4x8
@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L8_M4_END:
.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L8_M4_20
bne .Ldtrmm_kernel_L8_M4_20
dtrmm_kernel_L8_M2_BEGIN:
.Ldtrmm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L8_M1_BEGIN
ble .Ldtrmm_kernel_L8_M1_BEGIN
dtrmm_kernel_L8_M2_20:
.Ldtrmm_kernel_L8_M2_20:
INIT2x8
@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M2_40
ble .Ldtrmm_kernel_L8_M2_40
dtrmm_kernel_L8_M2_22:
.Ldtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22
bgt .Ldtrmm_kernel_L8_M2_22
dtrmm_kernel_L8_M2_40:
.Ldtrmm_kernel_L8_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M2_100
ble .Ldtrmm_kernel_L8_M2_100
dtrmm_kernel_L8_M2_42:
.Ldtrmm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_42
bgt .Ldtrmm_kernel_L8_M2_42
dtrmm_kernel_L8_M2_100:
.Ldtrmm_kernel_L8_M2_100:
SAVE2x8
@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L8_M2_END:
.Ldtrmm_kernel_L8_M2_END:
dtrmm_kernel_L8_M1_BEGIN:
.Ldtrmm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END
dtrmm_kernel_L8_M1_20:
.Ldtrmm_kernel_L8_M1_20:
INIT1x8
@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M1_40
ble .Ldtrmm_kernel_L8_M1_40
dtrmm_kernel_L8_M1_22:
.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22
bgt .Ldtrmm_kernel_L8_M1_22
dtrmm_kernel_L8_M1_40:
.Ldtrmm_kernel_L8_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M1_100
ble .Ldtrmm_kernel_L8_M1_100
dtrmm_kernel_L8_M1_42:
.Ldtrmm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_42
bgt .Ldtrmm_kernel_L8_M1_42
dtrmm_kernel_L8_M1_100:
.Ldtrmm_kernel_L8_M1_100:
SAVE1x8
@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L8_END:
.Ldtrmm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN
bgt .Ldtrmm_kernel_L8_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
ble dtrmm_kernel_L999
ble .Ldtrmm_kernel_L999
tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:
tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
INIT4x4
dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M8_BEGIN:
.Ldtrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
ble .Ldtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
.Ldtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
blt .Ldtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
ble .Ldtrmm_kernel_L4_M8_22a
.align 5
dtrmm_kernel_L4_M8_22:
.Ldtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
bgt .Ldtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
.Ldtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
.Ldtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
ble .Ldtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44
dtrmm_kernel_L4_M8_40:
.Ldtrmm_kernel_L4_M8_40:
INIT8x4
dtrmm_kernel_L4_M8_44:
.Ldtrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
ble .Ldtrmm_kernel_L4_M8_100
.align 5
dtrmm_kernel_L4_M8_46:
.Ldtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
bne .Ldtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
.Ldtrmm_kernel_L4_M8_100:
SAVE8x4
@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M8_20
bne .Ldtrmm_kernel_L4_M8_20
dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #4
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:
INIT4x4
@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40
dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_42:
.Ldtrmm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_42
bgt .Ldtrmm_kernel_L4_M4_42
dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A
dtrmm_kernel_L2_M8_BEGIN:
.Ldtrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN
ble .Ldtrmm_kernel_L2_M4_BEGIN
dtrmm_kernel_L2_M8_20:
.Ldtrmm_kernel_L2_M8_20:
INIT8x2
@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M8_40
ble .Ldtrmm_kernel_L2_M8_40
.align 5
dtrmm_kernel_L2_M8_22:
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22
bgt .Ldtrmm_kernel_L2_M8_22
dtrmm_kernel_L2_M8_40:
.Ldtrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100
ble .Ldtrmm_kernel_L2_M8_100
dtrmm_kernel_L2_M8_42:
.Ldtrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42
bgt .Ldtrmm_kernel_L2_M8_42
dtrmm_kernel_L2_M8_100:
.Ldtrmm_kernel_L2_M8_100:
SAVE8x2
@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif
dtrmm_kernel_L2_M8_END:
.Ldtrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20
bgt .Ldtrmm_kernel_L2_M8_20
dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5
dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:
dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/
dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A
dtrmm_kernel_L1_M8_BEGIN:
.Ldtrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN
ble .Ldtrmm_kernel_L1_M4_BEGIN
dtrmm_kernel_L1_M8_20:
.Ldtrmm_kernel_L1_M8_20:
INIT8x1
@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M8_40
ble .Ldtrmm_kernel_L1_M8_40
.align 5
dtrmm_kernel_L1_M8_22:
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22
bgt .Ldtrmm_kernel_L1_M8_22
dtrmm_kernel_L1_M8_40:
.Ldtrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100
ble .Ldtrmm_kernel_L1_M8_100
dtrmm_kernel_L1_M8_42:
.Ldtrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42
bgt .Ldtrmm_kernel_L1_M8_42
dtrmm_kernel_L1_M8_100:
.Ldtrmm_kernel_L1_M8_100:
SAVE8x1
@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif
dtrmm_kernel_L1_M8_END:
.Ldtrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20
bgt .Ldtrmm_kernel_L1_M8_20
dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5
dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:
dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
cmp M, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N
cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
bne .Lgemv_n_kernel_S_BEGIN
gemv_n_kernel_F_LOOP:
.Lgemv_n_kernel_F_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y
mov Y_OPTR, Y
gemv_n_kernel_F32:
.Lgemv_n_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_n_kernel_F4
beq .Lgemv_n_kernel_F4
gemv_n_kernel_F320:
.Lgemv_n_kernel_F320:
KERNEL_F16
KERNEL_F16
subs I, I, #1
bne gemv_n_kernel_F320
bne .Lgemv_n_kernel_F320
gemv_n_kernel_F4:
.Lgemv_n_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1
beq .Lgemv_n_kernel_F1
gemv_n_kernel_F40:
.Lgemv_n_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_n_kernel_F40
bne .Lgemv_n_kernel_F40
gemv_n_kernel_F1:
.Lgemv_n_kernel_F1:
ands I, M, #3
ble gemv_n_kernel_F_END
ble .Lgemv_n_kernel_F_END
gemv_n_kernel_F10:
.Lgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_n_kernel_F10
bne .Lgemv_n_kernel_F10
gemv_n_kernel_F_END:
.Lgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_F_LOOP
bne .Lgemv_n_kernel_F_LOOP
b gemv_n_kernel_L999
b .Lgemv_n_kernel_L999
gemv_n_kernel_S_BEGIN:
.Lgemv_n_kernel_S_BEGIN:
INIT_S
gemv_n_kernel_S_LOOP:
.Lgemv_n_kernel_S_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble gemv_n_kernel_S1
ble .Lgemv_n_kernel_S1
gemv_n_kernel_S4:
.Lgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S4
bne .Lgemv_n_kernel_S4
gemv_n_kernel_S1:
.Lgemv_n_kernel_S1:
ands I, M, #3
ble gemv_n_kernel_S_END
ble .Lgemv_n_kernel_S_END
gemv_n_kernel_S10:
.Lgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S10
bne .Lgemv_n_kernel_S10
gemv_n_kernel_S_END:
.Lgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_S_LOOP
bne .Lgemv_n_kernel_S_LOOP
gemv_n_kernel_L999:
.Lgemv_n_kernel_L999:
mov w0, wzr

View File

@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
cmp M, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N
cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
bne .Lgemv_t_kernel_S_BEGIN
gemv_t_kernel_F_LOOP:
.Lgemv_t_kernel_F_LOOP:
fmov TEMP, REG0
fmov TEMP1, REG0
@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
gemv_t_kernel_F32:
.Lgemv_t_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_t_kernel_F4
beq .Lgemv_t_kernel_F4
gemv_t_kernel_F320:
.Lgemv_t_kernel_F320:
KERNEL_F32
subs I, I, #1
bne gemv_t_kernel_F320
bne .Lgemv_t_kernel_F320
KERNEL_F32_FINALIZE
gemv_t_kernel_F4:
.Lgemv_t_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1
beq .Lgemv_t_kernel_F1
gemv_t_kernel_F40:
.Lgemv_t_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_t_kernel_F40
bne .Lgemv_t_kernel_F40
gemv_t_kernel_F1:
.Lgemv_t_kernel_F1:
KERNEL_F4_FINALIZE
ands I, M, #3
ble gemv_t_kernel_F_END
ble .Lgemv_t_kernel_F_END
gemv_t_kernel_F10:
.Lgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_t_kernel_F10
bne .Lgemv_t_kernel_F10
gemv_t_kernel_F_END:
.Lgemv_t_kernel_F_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
bne .Lgemv_t_kernel_F_LOOP
b gemv_t_kernel_L999
b .Lgemv_t_kernel_L999
gemv_t_kernel_S_BEGIN:
.Lgemv_t_kernel_S_BEGIN:
INIT_S
gemv_t_kernel_S_LOOP:
.Lgemv_t_kernel_S_LOOP:
fmov TEMP, REG0
mov A_PTR, A
@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble gemv_t_kernel_S1
ble .Lgemv_t_kernel_S1
gemv_t_kernel_S4:
.Lgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S4
bne .Lgemv_t_kernel_S4
gemv_t_kernel_S1:
.Lgemv_t_kernel_S1:
ands I, M, #3
ble gemv_t_kernel_S_END
ble .Lgemv_t_kernel_S_END
gemv_t_kernel_S10:
.Lgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S10
bne .Lgemv_t_kernel_S10
gemv_t_kernel_S_END:
.Lgemv_t_kernel_S_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
bne .Lgemv_t_kernel_S_LOOP
gemv_t_kernel_L999:
.Lgemv_t_kernel_L999:
RESTORE_REGS

View File

@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Liamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
.Liamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999
asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
beq .Liamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
.Liamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
bne .Liamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
.Liamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
ble .Liamax_kernel_L999
iamax_kernel_F10:
.Liamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
bne .Liamax_kernel_F10
b iamax_kernel_L999
b .Liamax_kernel_L999
iamax_kernel_S_BEGIN:
.Liamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Liamax_kernel_S1
iamax_kernel_S4:
.Liamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
bne .Liamax_kernel_S4
iamax_kernel_S1:
.Liamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
ble .Liamax_kernel_L999
iamax_kernel_S10:
.Liamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
bne .Liamax_kernel_S10
iamax_kernel_L999:
.Liamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
.Liamax_kernel_zero:
mov x0, xzr
ret

View File

@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Lizamax_kernel_S_BEGIN
mov x7, X
iamax_kernel_F_BEGIN:
.Lizamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
ble .Lizamax_kernel_F1
add Z, Z, #1
iamax_kernel_F8:
.Lizamax_kernel_F8:
KERNEL_F8
subs I, I, #1
bne iamax_kernel_F8
bne .Lizamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
iamax_kernel_F1:
.Lizamax_kernel_F1:
ands I, N, #7
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
iamax_kernel_F10:
.Lizamax_kernel_F10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_F10
bne .Lizamax_kernel_F10
b iamax_kernel_L999
b .Lizamax_kernel_L999
iamax_kernel_S_BEGIN:
.Lizamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Lizamax_kernel_S1
iamax_kernel_S4:
.Lizamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
bne .Lizamax_kernel_S4
iamax_kernel_S1:
.Lizamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
ble .Lizamax_kernel_L999
iamax_kernel_S10:
.Lizamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
bne .Lizamax_kernel_S10
iamax_kernel_L999:
.Lizamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
.Lizamax_kernel_zero:
mov x0, xzr
ret

View File

@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT
cmp N, #0
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lnrm2_kernel_L999
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lnrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
.Lnrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lnrm2_kernel_F1
nrm2_kernel_F8:
.Lnrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
bne .Lnrm2_kernel_F8
nrm2_kernel_F1:
.Lnrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999
nrm2_kernel_F10:
.Lnrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
bne .Lnrm2_kernel_F10
b nrm2_kernel_L999
b .Lnrm2_kernel_L999
nrm2_kernel_S_BEGIN:
.Lnrm2_kernel_S_BEGIN:
INIT_S
@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:
.align 5
nrm2_kernel_S10:
.Lnrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
bne .Lnrm2_kernel_S10
nrm2_kernel_L999:
.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ

View File

@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
ble .Lrot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
.Lrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lrot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
.Lrot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
bne .Lrot_kernel_F4
rot_kernel_F1:
.Lrot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999
INIT_F1
rot_kernel_F10:
.Lrot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
bne .Lrot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
.Lrot_kernel_S_BEGIN:
INIT_S
INIT_F1
@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lrot_kernel_S1
rot_kernel_S4:
.Lrot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
bne .Lrot_kernel_S4
rot_kernel_S1:
.Lrot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999
rot_kernel_S10:
.Lrot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
bne .Lrot_kernel_S10
rot_kernel_L999:
.Lrot_kernel_L999:
mov w0, wzr
ret

View File

@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble scal_kernel_L999
ble .Lscal_kernel_L999
fcmp DA, #0.0
beq scal_kernel_zero
beq .Lscal_kernel_zero
cmp INC_X, #1
bne scal_kernel_S_BEGIN
bne .Lscal_kernel_S_BEGIN
scal_kernel_F_BEGIN:
.Lscal_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq scal_kernel_F1
beq .Lscal_kernel_F1
KERNEL_INIT_F8
scal_kernel_F8:
.Lscal_kernel_F8:
KERNEL_F8
subs I, I, #1
bne scal_kernel_F8
bne .Lscal_kernel_F8
scal_kernel_F1:
.Lscal_kernel_F1:
ands I, N, #7
ble scal_kernel_L999
ble .Lscal_kernel_L999
scal_kernel_F10:
.Lscal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne scal_kernel_F10
bne .Lscal_kernel_F10
mov w0, wzr
ret
scal_kernel_S_BEGIN:
.Lscal_kernel_S_BEGIN:
INIT_S
mov X_COPY, X
asr I, N, #2
cmp I, xzr
ble scal_kernel_S1
ble .Lscal_kernel_S1
scal_kernel_S4:
.Lscal_kernel_S4:
KERNEL_S4
subs I, I, #1
bne scal_kernel_S4
bne .Lscal_kernel_S4
scal_kernel_S1:
.Lscal_kernel_S1:
ands I, N, #3
ble scal_kernel_L999
ble .Lscal_kernel_L999
scal_kernel_S10:
.Lscal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne scal_kernel_S10
bne .Lscal_kernel_S10
scal_kernel_L999:
.Lscal_kernel_L999:
mov w0, wzr
ret
scal_kernel_zero:
.Lscal_kernel_zero:
INIT_S
scal_kernel_Z1:
.Lscal_kernel_Z1:
st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1
bne .Lscal_kernel_Z1
mov w0, wzr
ret

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN
ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/
sgemm_kernel_L4_BEGIN:
.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1
add pA_3, temp, pA_2
sgemm_kernel_L4_M16_BEGIN:
.Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
ble .Lsgemm_kernel_L4_M8_BEGIN
sgemm_kernel_L4_M16_20:
.Lsgemm_kernel_L4_M16_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32
blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K
subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
ble .Lsgemm_kernel_L4_M16_22a
.align 5
sgemm_kernel_L4_M16_22:
.Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
bgt .Lsgemm_kernel_L4_M16_22
sgemm_kernel_L4_M16_22a:
.Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_32:
.Lsgemm_kernel_L4_M16_32:
tst counterL, #1
ble sgemm_kernel_L4_M16_40
ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_E
b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_40:
.Lsgemm_kernel_L4_M16_40:
INIT16x4
sgemm_kernel_L4_M16_44:
.Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100
ble .Lsgemm_kernel_L4_M16_100
sgemm_kernel_L4_M16_46:
.Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
sgemm_kernel_L4_M16_100:
.Lsgemm_kernel_L4_M16_100:
SAVE16x4
sgemm_kernel_L4_M16_END:
.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp
add pA_0, pA_0, temp
@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp
add pA_3, pA_2, temp
subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20
bne .Lsgemm_kernel_L4_M16_20
sgemm_kernel_L4_M8_BEGIN:
.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN
ble .Lsgemm_kernel_L4_M4_BEGIN
sgemm_kernel_L4_M8_20:
.Lsgemm_kernel_L4_M8_20:
INIT8x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M8_40
ble .Lsgemm_kernel_L4_M8_40
sgemm_kernel_L4_M8_22:
.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_SUB
KERNEL8x4_SUB
@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22
bgt .Lsgemm_kernel_L4_M8_22
sgemm_kernel_L4_M8_40:
.Lsgemm_kernel_L4_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100
ble .Lsgemm_kernel_L4_M8_100
sgemm_kernel_L4_M8_42:
.Lsgemm_kernel_L4_M8_42:
KERNEL8x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42
bgt .Lsgemm_kernel_L4_M8_42
sgemm_kernel_L4_M8_100:
.Lsgemm_kernel_L4_M8_100:
SAVE8x4
sgemm_kernel_L4_M8_END:
.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp
sgemm_kernel_L4_M4_BEGIN:
.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN
ble .Lsgemm_kernel_L4_M2_BEGIN
sgemm_kernel_L4_M4_20:
.Lsgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M4_40
ble .Lsgemm_kernel_L4_M4_40
sgemm_kernel_L4_M4_22:
.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22
bgt .Lsgemm_kernel_L4_M4_22
sgemm_kernel_L4_M4_40:
.Lsgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100
ble .Lsgemm_kernel_L4_M4_100
sgemm_kernel_L4_M4_42:
.Lsgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42
bgt .Lsgemm_kernel_L4_M4_42
sgemm_kernel_L4_M4_100:
.Lsgemm_kernel_L4_M4_100:
SAVE4x4
sgemm_kernel_L4_M4_END:
.Lsgemm_kernel_L4_M4_END:
sgemm_kernel_L4_M2_BEGIN:
.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L4_M1_BEGIN
ble .Lsgemm_kernel_L4_M1_BEGIN
sgemm_kernel_L4_M2_20:
.Lsgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M2_40
ble .Lsgemm_kernel_L4_M2_40
sgemm_kernel_L4_M2_22:
.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22
bgt .Lsgemm_kernel_L4_M2_22
sgemm_kernel_L4_M2_40:
.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100
ble .Lsgemm_kernel_L4_M2_100
sgemm_kernel_L4_M2_42:
.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42
bgt .Lsgemm_kernel_L4_M2_42
sgemm_kernel_L4_M2_100:
.Lsgemm_kernel_L4_M2_100:
SAVE2x4
sgemm_kernel_L4_M2_END:
.Lsgemm_kernel_L4_M2_END:
sgemm_kernel_L4_M1_BEGIN:
.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END
sgemm_kernel_L4_M1_20:
.Lsgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M1_40
ble .Lsgemm_kernel_L4_M1_40
sgemm_kernel_L4_M1_22:
.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22
bgt .Lsgemm_kernel_L4_M1_22
sgemm_kernel_L4_M1_40:
.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100
ble .Lsgemm_kernel_L4_M1_100
sgemm_kernel_L4_M1_42:
.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42
bgt .Lsgemm_kernel_L4_M1_42
sgemm_kernel_L4_M1_100:
.Lsgemm_kernel_L4_M1_100:
SAVE1x4
sgemm_kernel_L4_END:
.Lsgemm_kernel_L4_END:
lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN
bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble sgemm_kernel_L999
ble .Lsgemm_kernel_L999
tst counterJ , #2
ble sgemm_kernel_L1_BEGIN
ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
sgemm_kernel_L2_M4_BEGIN:
.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN
ble .Lsgemm_kernel_L2_M2_BEGIN
sgemm_kernel_L2_M4_20:
.Lsgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M4_40
ble .Lsgemm_kernel_L2_M4_40
.align 5
sgemm_kernel_L2_M4_22:
.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22
bgt .Lsgemm_kernel_L2_M4_22
sgemm_kernel_L2_M4_40:
.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100
ble .Lsgemm_kernel_L2_M4_100
sgemm_kernel_L2_M4_42:
.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42
bgt .Lsgemm_kernel_L2_M4_42
sgemm_kernel_L2_M4_100:
.Lsgemm_kernel_L2_M4_100:
SAVE4x2
sgemm_kernel_L2_M4_END:
.Lsgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20
bgt .Lsgemm_kernel_L2_M4_20
sgemm_kernel_L2_M2_BEGIN:
.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L2_M1_BEGIN
ble .Lsgemm_kernel_L2_M1_BEGIN
sgemm_kernel_L2_M2_20:
.Lsgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M2_40
ble .Lsgemm_kernel_L2_M2_40
sgemm_kernel_L2_M2_22:
.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22
bgt .Lsgemm_kernel_L2_M2_22
sgemm_kernel_L2_M2_40:
.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100
ble .Lsgemm_kernel_L2_M2_100
sgemm_kernel_L2_M2_42:
.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42
bgt .Lsgemm_kernel_L2_M2_42
sgemm_kernel_L2_M2_100:
.Lsgemm_kernel_L2_M2_100:
SAVE2x2
sgemm_kernel_L2_M2_END:
.Lsgemm_kernel_L2_M2_END:
sgemm_kernel_L2_M1_BEGIN:
.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END
sgemm_kernel_L2_M1_20:
.Lsgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L2_M1_40
ble .Lsgemm_kernel_L2_M1_40
sgemm_kernel_L2_M1_22:
.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22
bgt .Lsgemm_kernel_L2_M1_22
sgemm_kernel_L2_M1_40:
.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100
ble .Lsgemm_kernel_L2_M1_100
sgemm_kernel_L2_M1_42:
.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42
bgt .Lsgemm_kernel_L2_M1_42
sgemm_kernel_L2_M1_100:
.Lsgemm_kernel_L2_M1_100:
SAVE1x2
sgemm_kernel_L2_END:
.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
sgemm_kernel_L1_BEGIN:
.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble sgemm_kernel_L999 // done
ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:
sgemm_kernel_L1_M4_BEGIN:
.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN
ble .Lsgemm_kernel_L1_M2_BEGIN
sgemm_kernel_L1_M4_20:
.Lsgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M4_40
ble .Lsgemm_kernel_L1_M4_40
.align 5
sgemm_kernel_L1_M4_22:
.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22
bgt .Lsgemm_kernel_L1_M4_22
sgemm_kernel_L1_M4_40:
.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100
ble .Lsgemm_kernel_L1_M4_100
sgemm_kernel_L1_M4_42:
.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42
bgt .Lsgemm_kernel_L1_M4_42
sgemm_kernel_L1_M4_100:
.Lsgemm_kernel_L1_M4_100:
SAVE4x1
sgemm_kernel_L1_M4_END:
.Lsgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20
bgt .Lsgemm_kernel_L1_M4_20
sgemm_kernel_L1_M2_BEGIN:
.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L1_M1_BEGIN
ble .Lsgemm_kernel_L1_M1_BEGIN
sgemm_kernel_L1_M2_20:
.Lsgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M2_40
ble .Lsgemm_kernel_L1_M2_40
sgemm_kernel_L1_M2_22:
.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22
bgt .Lsgemm_kernel_L1_M2_22
sgemm_kernel_L1_M2_40:
.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100
ble .Lsgemm_kernel_L1_M2_100
sgemm_kernel_L1_M2_42:
.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42
bgt .Lsgemm_kernel_L1_M2_42
sgemm_kernel_L1_M2_100:
.Lsgemm_kernel_L1_M2_100:
SAVE2x1
sgemm_kernel_L1_M2_END:
.Lsgemm_kernel_L1_M2_END:
sgemm_kernel_L1_M1_BEGIN:
.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END
sgemm_kernel_L1_M1_20:
.Lsgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M1_40
ble .Lsgemm_kernel_L1_M1_40
sgemm_kernel_L1_M1_22:
.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22
bgt .Lsgemm_kernel_L1_M1_22
sgemm_kernel_L1_M1_40:
.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100
ble .Lsgemm_kernel_L1_M1_100
sgemm_kernel_L1_M1_42:
.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42
bgt .Lsgemm_kernel_L1_M1_42
sgemm_kernel_L1_M1_100:
.Lsgemm_kernel_L1_M1_100:
SAVE1x1
sgemm_kernel_L1_END:
.Lsgemm_kernel_L1_END:
sgemm_kernel_L999:
.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
strmm_kernel_begin:
.Lstrmm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble strmm_kernel_L2_BEGIN
ble .Lstrmm_kernel_L2_BEGIN
/******************************************************************************/
strmm_kernel_L4_BEGIN:
.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
strmm_kernel_L4_M4_BEGIN:
.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN
ble .Lstrmm_kernel_L4_M2_BEGIN
strmm_kernel_L4_M4_20:
.Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32
blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a
ble .Lstrmm_kernel_L4_M4_22a
.align 5
strmm_kernel_L4_M4_22:
.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22
bgt .Lstrmm_kernel_L4_M4_22
strmm_kernel_L4_M4_22a:
.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_32:
.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
ble strmm_kernel_L4_M4_40
ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_40:
.Lstrmm_kernel_L4_M4_40:
INIT4x4
strmm_kernel_L4_M4_44:
.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100
ble .Lstrmm_kernel_L4_M4_100
strmm_kernel_L4_M4_46:
.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
strmm_kernel_L4_M4_100:
.Lstrmm_kernel_L4_M4_100:
SAVE4x4
@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L4_M4_END:
.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20
bne .Lstrmm_kernel_L4_M4_20
strmm_kernel_L4_M2_BEGIN:
.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L4_M1_BEGIN
ble .Lstrmm_kernel_L4_M1_BEGIN
strmm_kernel_L4_M2_20:
.Lstrmm_kernel_L4_M2_20:
INIT2x4
@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M2_40
ble .Lstrmm_kernel_L4_M2_40
strmm_kernel_L4_M2_22:
.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22
bgt .Lstrmm_kernel_L4_M2_22
strmm_kernel_L4_M2_40:
.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100
ble .Lstrmm_kernel_L4_M2_100
strmm_kernel_L4_M2_42:
.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42
bgt .Lstrmm_kernel_L4_M2_42
strmm_kernel_L4_M2_100:
.Lstrmm_kernel_L4_M2_100:
SAVE2x4
@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif
strmm_kernel_L4_M2_END:
.Lstrmm_kernel_L4_M2_END:
strmm_kernel_L4_M1_BEGIN:
.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END
strmm_kernel_L4_M1_20:
.Lstrmm_kernel_L4_M1_20:
INIT1x4
@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M1_40
ble .Lstrmm_kernel_L4_M1_40
strmm_kernel_L4_M1_22:
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22
bgt .Lstrmm_kernel_L4_M1_22
strmm_kernel_L4_M1_40:
.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100
ble .Lstrmm_kernel_L4_M1_100
strmm_kernel_L4_M1_42:
.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42
bgt .Lstrmm_kernel_L4_M1_42
strmm_kernel_L4_M1_100:
.Lstrmm_kernel_L4_M1_100:
SAVE1x4
@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif
strmm_kernel_L4_END:
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN
bgt .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/
strmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble strmm_kernel_L999
ble .Lstrmm_kernel_L999
tst counterJ , #2
ble strmm_kernel_L1_BEGIN
ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
strmm_kernel_L2_M4_BEGIN:
.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN
ble .Lstrmm_kernel_L2_M2_BEGIN
strmm_kernel_L2_M4_20:
.Lstrmm_kernel_L2_M4_20:
INIT4x2
@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M4_40
ble .Lstrmm_kernel_L2_M4_40
.align 5
strmm_kernel_L2_M4_22:
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22
bgt .Lstrmm_kernel_L2_M4_22
strmm_kernel_L2_M4_40:
.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100
ble .Lstrmm_kernel_L2_M4_100
strmm_kernel_L2_M4_42:
.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42
bgt .Lstrmm_kernel_L2_M4_42
strmm_kernel_L2_M4_100:
.Lstrmm_kernel_L2_M4_100:
SAVE4x2
@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L2_M4_END:
.Lstrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20
bgt .Lstrmm_kernel_L2_M4_20
strmm_kernel_L2_M2_BEGIN:
.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L2_M1_BEGIN
ble .Lstrmm_kernel_L2_M1_BEGIN
strmm_kernel_L2_M2_20:
.Lstrmm_kernel_L2_M2_20:
INIT2x2
@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M2_40
ble .Lstrmm_kernel_L2_M2_40
strmm_kernel_L2_M2_22:
.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22
bgt .Lstrmm_kernel_L2_M2_22
strmm_kernel_L2_M2_40:
.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100
ble .Lstrmm_kernel_L2_M2_100
strmm_kernel_L2_M2_42:
.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42
bgt .Lstrmm_kernel_L2_M2_42
strmm_kernel_L2_M2_100:
.Lstrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
strmm_kernel_L2_M2_END:
.Lstrmm_kernel_L2_M2_END:
strmm_kernel_L2_M1_BEGIN:
.Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END
strmm_kernel_L2_M1_20:
.Lstrmm_kernel_L2_M1_20:
INIT1x2
@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble strmm_kernel_L2_M1_40
ble .Lstrmm_kernel_L2_M1_40
strmm_kernel_L2_M1_22:
.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22
bgt .Lstrmm_kernel_L2_M1_22
strmm_kernel_L2_M1_40:
.Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100
ble .Lstrmm_kernel_L2_M1_100
strmm_kernel_L2_M1_42:
.Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42
bgt .Lstrmm_kernel_L2_M1_42
strmm_kernel_L2_M1_100:
.Lstrmm_kernel_L2_M1_100:
SAVE1x2
@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif
strmm_kernel_L2_END:
.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:
/******************************************************************************/
strmm_kernel_L1_BEGIN:
.Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble strmm_kernel_L999 // done
ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A
strmm_kernel_L1_M4_BEGIN:
.Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN
ble .Lstrmm_kernel_L1_M2_BEGIN
strmm_kernel_L1_M4_20:
.Lstrmm_kernel_L1_M4_20:
INIT4x1
@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M4_40
ble .Lstrmm_kernel_L1_M4_40
.align 5
strmm_kernel_L1_M4_22:
.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22
bgt .Lstrmm_kernel_L1_M4_22
strmm_kernel_L1_M4_40:
.Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100
ble .Lstrmm_kernel_L1_M4_100
strmm_kernel_L1_M4_42:
.Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42
bgt .Lstrmm_kernel_L1_M4_42
strmm_kernel_L1_M4_100:
.Lstrmm_kernel_L1_M4_100:
SAVE4x1
@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
strmm_kernel_L1_M4_END:
.Lstrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20
bgt .Lstrmm_kernel_L1_M4_20
strmm_kernel_L1_M2_BEGIN:
.Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L1_M1_BEGIN
ble .Lstrmm_kernel_L1_M1_BEGIN
strmm_kernel_L1_M2_20:
.Lstrmm_kernel_L1_M2_20:
INIT2x1
@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M2_40
ble .Lstrmm_kernel_L1_M2_40
strmm_kernel_L1_M2_22:
.Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22
bgt .Lstrmm_kernel_L1_M2_22
strmm_kernel_L1_M2_40:
.Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100
ble .Lstrmm_kernel_L1_M2_100
strmm_kernel_L1_M2_42:
.Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42
bgt .Lstrmm_kernel_L1_M2_42
strmm_kernel_L1_M2_100:
.Lstrmm_kernel_L1_M2_100:
SAVE2x1
@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif
strmm_kernel_L1_M2_END:
.Lstrmm_kernel_L1_M2_END:
strmm_kernel_L1_M1_BEGIN:
.Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END
strmm_kernel_L1_M1_20:
.Lstrmm_kernel_L1_M1_20:
INIT1x1
@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M1_40
ble .Lstrmm_kernel_L1_M1_40
strmm_kernel_L1_M1_22:
.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22
bgt .Lstrmm_kernel_L1_M1_22
strmm_kernel_L1_M1_40:
.Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100
ble .Lstrmm_kernel_L1_M1_100
strmm_kernel_L1_M1_42:
.Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42
bgt .Lstrmm_kernel_L1_M1_42
strmm_kernel_L1_M1_100:
.Lstrmm_kernel_L1_M1_100:
SAVE1x1
@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif
#endif
strmm_kernel_L1_END:
.Lstrmm_kernel_L1_END:
#if 0
#if !defined(LEFT)
@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif
#endif
strmm_kernel_L999:
.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

View File

@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble swap_kernel_L999
ble .Lswap_kernel_L999
cmp INC_X, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
swap_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq swap_kernel_F1
beq .Lswap_kernel_F1
swap_kernel_F8:
.Lswap_kernel_F8:
KERNEL_F8
subs I, I, #1
bne swap_kernel_F8
bne .Lswap_kernel_F8
swap_kernel_F1:
.Lswap_kernel_F1:
ands I, N, #7
ble swap_kernel_L999
ble .Lswap_kernel_L999
swap_kernel_F10:
.Lswap_kernel_F10:
KERNEL_F1
subs I, I, #1
bne swap_kernel_F10
bne .Lswap_kernel_F10
b swap_kernel_L999
b .Lswap_kernel_L999
swap_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble swap_kernel_S1
ble .Lswap_kernel_S1
swap_kernel_S4:
.Lswap_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S4
bne .Lswap_kernel_S4
swap_kernel_S1:
.Lswap_kernel_S1:
ands I, N, #3
ble swap_kernel_L999
ble .Lswap_kernel_L999
swap_kernel_S10:
.Lswap_kernel_S10:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S10
bne .Lswap_kernel_S10
swap_kernel_L999:
.Lswap_kernel_L999:
mov w0, wzr
ret

View File

@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lzamax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
.Lzamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lzamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lzamax_kernel_F1
amax_kernel_F4:
.Lzamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
bne .Lzamax_kernel_F4
amax_kernel_F1:
.Lzamax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999
amax_kernel_F10:
.Lzamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
bne .Lzamax_kernel_F10
ret
amax_kernel_F1_INIT:
.Lzamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lzamax_kernel_F1
amax_kernel_S_BEGIN:
.Lzamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
ble .Lzamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lzamax_kernel_S1
amax_kernel_S4:
.Lzamax_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
bne .Lzamax_kernel_S4
amax_kernel_S1:
.Lzamax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999
amax_kernel_S10:
.Lzamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
bne .Lzamax_kernel_S10
amax_kernel_L999:
.Lzamax_kernel_L999:
ret
amax_kernel_zero:
.Lzamax_kernel_zero:
fmov MAXF, REG0
ret

View File

@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0
cmp N, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lzasum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
.Lzasum_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq asum_kernel_F1
beq .Lzasum_kernel_F1
asum_kernel_F4:
.Lzasum_kernel_F4:
KERNEL_F4
subs I, I, #1
bne asum_kernel_F4
bne .Lzasum_kernel_F4
KERNEL_F4_FINALIZE
asum_kernel_F1:
.Lzasum_kernel_F1:
ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999
asum_kernel_F10:
.Lzasum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
bne .Lzasum_kernel_F10
asum_kernel_L999:
.Lzasum_kernel_L999:
ret
asum_kernel_S_BEGIN:
.Lzasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lzasum_kernel_S1
asum_kernel_S4:
.Lzasum_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
bne .Lzasum_kernel_S4
asum_kernel_S1:
.Lzasum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999
asum_kernel_S10:
.Lzasum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
bne .Lzasum_kernel_S10
ret

View File

@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
mov Y_COPY, Y
fcmp DA_R, #0.0
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999
beq .Lzaxpy_kernel_L999
.L1:
INIT
cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
zaxpy_kernel_F_BEGIN:
.Lzaxpy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zaxpy_kernel_F1
beq .Lzaxpy_kernel_F1
KERNEL_INIT_F4
zaxpy_kernel_F4:
.Lzaxpy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zaxpy_kernel_F4
bne .Lzaxpy_kernel_F4
zaxpy_kernel_F1:
.Lzaxpy_kernel_F1:
ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
zaxpy_kernel_F10:
.Lzaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zaxpy_kernel_F10
bne .Lzaxpy_kernel_F10
mov w0, wzr
ret
zaxpy_kernel_S_BEGIN:
.Lzaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1
ble .Lzaxpy_kernel_S1
zaxpy_kernel_S4:
.Lzaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S4
bne .Lzaxpy_kernel_S4
zaxpy_kernel_S1:
.Lzaxpy_kernel_S1:
ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999
zaxpy_kernel_S10:
.Lzaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S10
bne .Lzaxpy_kernel_S10
zaxpy_kernel_L999:
.Lzaxpy_kernel_L999:
mov w0, wzr
ret

View File

@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
cmp N, xzr
ble dot_kernel_L999
ble .Lzdot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
.Lzdot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Lzdot_kernel_F1
dot_kernel_F4:
.Lzdot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
bne .Lzdot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
.Lzdot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999
dot_kernel_F10:
.Lzdot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
bne .Lzdot_kernel_F10
ret
dot_kernel_S_BEGIN:
.Lzdot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Lzdot_kernel_S1
dot_kernel_S4:
.Lzdot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
bne .Lzdot_kernel_S4
dot_kernel_S1:
.Lzdot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999
dot_kernel_S10:
.Lzdot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
bne .Lzdot_kernel_S10
dot_kernel_L999:
.Lzdot_kernel_L999:
ret

View File

@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a
.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:
INIT4x4
zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20
zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:
SAVE2x4
zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:
zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:
SAVE1x4
zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999
tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5
zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22
zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:
SAVE4x2
zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20
zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:
SAVE2x2
zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:
zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:
SAVE1x2
zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:
zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5
zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22
zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:
SAVE4x1
zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20
zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:
SAVE2x1
zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:
zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:
SAVE1x1
zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:
zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a
.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:
INIT4x4
zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20
zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:
SAVE2x4
zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:
zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:
SAVE1x4
zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999
tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5
zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22
zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:
SAVE4x2
zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20
zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:
SAVE2x2
zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:
zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:
SAVE1x2
zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:
zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5
zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22
zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:
SAVE4x1
zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20
zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:
SAVE2x1
zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:
zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:
SAVE1x1
zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:
zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]

View File

@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT
cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
bne .Lzgemv_n_kernel_S_BEGIN
zgemv_n_kernel_F_LOOP:
.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:
asr I, M, #2
cmp I, xzr
beq zgemv_n_kernel_F1
beq .Lzgemv_n_kernel_F1
zgemv_n_kernel_F4:
.Lzgemv_n_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zgemv_n_kernel_F4
bne .Lzgemv_n_kernel_F4
zgemv_n_kernel_F1:
.Lzgemv_n_kernel_F1:
ands I, M, #3
ble zgemv_n_kernel_F_END
ble .Lzgemv_n_kernel_F_END
zgemv_n_kernel_F10:
.Lzgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_n_kernel_F10
bne .Lzgemv_n_kernel_F10
zgemv_n_kernel_F_END:
.Lzgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_F_LOOP
bne .Lzgemv_n_kernel_F_LOOP
b zgemv_n_kernel_L999
b .Lzgemv_n_kernel_L999
zgemv_n_kernel_S_BEGIN:
.Lzgemv_n_kernel_S_BEGIN:
INIT_S
zgemv_n_kernel_S_LOOP:
.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1
ble .Lzgemv_n_kernel_S1
zgemv_n_kernel_S4:
.Lzgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S4
bne .Lzgemv_n_kernel_S4
zgemv_n_kernel_S1:
.Lzgemv_n_kernel_S1:
ands I, M, #3
ble zgemv_n_kernel_S_END
ble .Lzgemv_n_kernel_S_END
zgemv_n_kernel_S10:
.Lzgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S10
bne .Lzgemv_n_kernel_S10
zgemv_n_kernel_S_END:
.Lzgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP
bne .Lzgemv_n_kernel_S_LOOP
zgemv_n_kernel_L999:
.Lzgemv_n_kernel_L999:
RESTORE_REGS
mov w0, wzr

View File

@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS
cmp N, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT
cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
bne .Lzgemv_t_kernel_S_BEGIN
zgemv_t_kernel_F_LOOP:
.Lzgemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:
asr I, M, #2
cmp I, xzr
beq zgemv_t_kernel_F1
beq .Lzgemv_t_kernel_F1
zgemv_t_kernel_F4:
.Lzgemv_t_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zgemv_t_kernel_F4
bne .Lzgemv_t_kernel_F4
KERNEL_F4_FINALIZE
zgemv_t_kernel_F1:
.Lzgemv_t_kernel_F1:
ands I, M, #3
ble zgemv_t_kernel_F_END
ble .Lzgemv_t_kernel_F_END
zgemv_t_kernel_F10:
.Lzgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_t_kernel_F10
bne .Lzgemv_t_kernel_F10
zgemv_t_kernel_F_END:
.Lzgemv_t_kernel_F_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_F_LOOP
bne .Lzgemv_t_kernel_F_LOOP
b zgemv_t_kernel_L999
b .Lzgemv_t_kernel_L999
zgemv_t_kernel_S_BEGIN:
.Lzgemv_t_kernel_S_BEGIN:
INIT_S
zgemv_t_kernel_S_LOOP:
.Lzgemv_t_kernel_S_LOOP:
mov A_PTR, A
mov X_PTR, X
@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:
asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1
ble .Lzgemv_t_kernel_S1
zgemv_t_kernel_S4:
.Lzgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S4
bne .Lzgemv_t_kernel_S4
zgemv_t_kernel_S1:
.Lzgemv_t_kernel_S1:
ands I, M, #3
ble zgemv_t_kernel_S_END
ble .Lzgemv_t_kernel_S_END
zgemv_t_kernel_S10:
.Lzgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S10
bne .Lzgemv_t_kernel_S10
zgemv_t_kernel_S_END:
.Lzgemv_t_kernel_S_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP
bne .Lzgemv_t_kernel_S_LOOP
zgemv_t_kernel_L999:
.Lzgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret

View File

@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
INIT
cmp N, #0
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lznrm2_kernel_L999
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lznrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
.Lznrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lznrm2_kernel_F1
nrm2_kernel_F8:
.Lznrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
bne .Lznrm2_kernel_F8
nrm2_kernel_F1:
.Lznrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999
nrm2_kernel_F10:
.Lznrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
bne .Lznrm2_kernel_F10
b nrm2_kernel_L999
b .Lznrm2_kernel_L999
nrm2_kernel_S_BEGIN:
.Lznrm2_kernel_S_BEGIN:
INIT_S
@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:
.align 5
nrm2_kernel_S10:
.Lznrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
bne .Lznrm2_kernel_S10
nrm2_kernel_L999:
.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ

View File

@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
ble .Lzrot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
.Lzrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lzrot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
.Lzrot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
bne .Lzrot_kernel_F4
rot_kernel_F1:
.Lzrot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999
rot_kernel_F10:
.Lzrot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
bne .Lzrot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
.Lzrot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lzrot_kernel_S1
rot_kernel_S4:
.Lzrot_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
bne .Lzrot_kernel_S4
rot_kernel_S1:
.Lzrot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999
rot_kernel_S10:
.Lzrot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
bne .Lzrot_kernel_S10
rot_kernel_L999:
.Lzrot_kernel_L999:
mov w0, wzr
ret

View File

@ -215,71 +215,71 @@ zscal_begin:
mov X_COPY, X
cmp N, xzr
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero
bne .Lzscal_kernel_R_non_zero
fcmp DA_I, #0.0
beq zscal_kernel_RI_zero
beq .Lzscal_kernel_RI_zero
b zscal_kernel_R_zero
b .Lzscal_kernel_R_zero
zscal_kernel_R_non_zero:
.Lzscal_kernel_R_non_zero:
fcmp DA_I, #0.0
beq zscal_kernel_I_zero
beq .Lzscal_kernel_I_zero
/*******************************************************************************
* A_R != 0 && A_I != 0
*******************************************************************************/
zscal_kernel_RI_non_zero:
.Lzscal_kernel_RI_non_zero:
INIT
cmp INC_X, #1
bne zscal_kernel_S_BEGIN
bne .Lzscal_kernel_S_BEGIN
zscal_kernel_F_BEGIN:
.Lzscal_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1
beq .Lzscal_kernel_F1
KERNEL_INIT_F4
zscal_kernel_F4:
.Lzscal_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zscal_kernel_F4
bne .Lzscal_kernel_F4
zscal_kernel_F1:
.Lzscal_kernel_F1:
ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
zscal_kernel_F10:
.Lzscal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zscal_kernel_F10
bne .Lzscal_kernel_F10
mov w0, wzr
ret
zscal_kernel_S_BEGIN:
.Lzscal_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1
ble .Lzscal_kernel_S1
zscal_kernel_S4:
.Lzscal_kernel_S4:
KERNEL_S1
KERNEL_S1
@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S4
bne .Lzscal_kernel_S4
zscal_kernel_S1:
.Lzscal_kernel_S1:
ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999
zscal_kernel_S10:
.Lzscal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S10
bne .Lzscal_kernel_S10
zscal_kernel_L999:
.Lzscal_kernel_L999:
mov w0, wzr
ret
@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0
*******************************************************************************/
zscal_kernel_R_zero:
.Lzscal_kernel_R_zero:
INIT_S
#if !defined(DOUBLE)
@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif
zscal_kernel_R_zero_1:
.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_R_zero_1
bne .Lzscal_kernel_R_zero_1
mov w0, wzr
ret
@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0
*******************************************************************************/
zscal_kernel_I_zero:
.Lzscal_kernel_I_zero:
INIT_S
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif
zscal_kernel_I_zero_1:
.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_I_zero_1
bne .Lzscal_kernel_I_zero_1
mov w0, wzr
ret
@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0
*******************************************************************************/
zscal_kernel_RI_zero:
.Lzscal_kernel_RI_zero:
INIT_S
zscal_kernel_RI_zero_1:
.Lzscal_kernel_RI_zero_1:
stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_RI_zero_1
bne .Lzscal_kernel_RI_zero_1
mov w0, wzr
ret

View File

@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN
ble .Lztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array
ztrmm_kernel_L4_M4_BEGIN:
.Lztrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
ble .Lztrmm_kernel_L4_M2_BEGIN
.align 5
ztrmm_kernel_L4_M4_20:
.Lztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:
asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
blt .Lztrmm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
ble .Lztrmm_kernel_L4_M4_22a
.align 5
ztrmm_kernel_L4_M4_22:
.Lztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
bgt .Lztrmm_kernel_L4_M4_22
.align 5
ztrmm_kernel_L4_M4_22a:
.Lztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44
.align 5
ztrmm_kernel_L4_M4_32:
.Lztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
ble .Lztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44
ztrmm_kernel_L4_M4_40:
.Lztrmm_kernel_L4_M4_40:
INIT4x4
ztrmm_kernel_L4_M4_44:
.Lztrmm_kernel_L4_M4_44:
ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
ble .Lztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
bne .Lztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
.Lztrmm_kernel_L4_M4_100:
SAVE4x4
@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
bne .Lztrmm_kernel_L4_M4_20
ztrmm_kernel_L4_M2_BEGIN:
.Lztrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L4_M1_BEGIN
ble .Lztrmm_kernel_L4_M1_BEGIN
ztrmm_kernel_L4_M2_20:
.Lztrmm_kernel_L4_M2_20:
INIT2x4
@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L4_M2_40
ble .Lztrmm_kernel_L4_M2_40
ztrmm_kernel_L4_M2_22:
.Lztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22
bgt .Lztrmm_kernel_L4_M2_22
ztrmm_kernel_L4_M2_40:
.Lztrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100
ble .Lztrmm_kernel_L4_M2_100
ztrmm_kernel_L4_M2_42:
.Lztrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42
bgt .Lztrmm_kernel_L4_M2_42
ztrmm_kernel_L4_M2_100:
.Lztrmm_kernel_L4_M2_100:
SAVE2x4
@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif
ztrmm_kernel_L4_M2_END:
.Lztrmm_kernel_L4_M2_END:
ztrmm_kernel_L4_M1_BEGIN:
.Lztrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END
ztrmm_kernel_L4_M1_20:
.Lztrmm_kernel_L4_M1_20:
INIT1x4
@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L4_M1_40
ble .Lztrmm_kernel_L4_M1_40
ztrmm_kernel_L4_M1_22:
.Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22
bgt .Lztrmm_kernel_L4_M1_22
ztrmm_kernel_L4_M1_40:
.Lztrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M1_100
ble .Lztrmm_kernel_L4_M1_100
ztrmm_kernel_L4_M1_42:
.Lztrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42
bgt .Lztrmm_kernel_L4_M1_42
ztrmm_kernel_L4_M1_100:
.Lztrmm_kernel_L4_M1_100:
SAVE1x4
@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif
ztrmm_kernel_L4_END:
.Lztrmm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif
subs counterJ, counterJ , #1 // j--
bgt ztrmm_kernel_L4_BEGIN
bgt .Lztrmm_kernel_L4_BEGIN
/******************************************************************************/
ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble ztrmm_kernel_L999 // error, N was less than 4?
ble .Lztrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble ztrmm_kernel_L1_BEGIN
ble .Lztrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
ztrmm_kernel_L2_M4_BEGIN:
.Lztrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble ztrmm_kernel_L2_M2_BEGIN
ble .Lztrmm_kernel_L2_M2_BEGIN
ztrmm_kernel_L2_M4_20:
.Lztrmm_kernel_L2_M4_20:
INIT4x2
@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ztrmm_kernel_L2_M4_40
ble .Lztrmm_kernel_L2_M4_40
.align 5
ztrmm_kernel_L2_M4_22:
.Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22
bgt .Lztrmm_kernel_L2_M4_22
ztrmm_kernel_L2_M4_40:
.Lztrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M4_100
ble .Lztrmm_kernel_L2_M4_100
ztrmm_kernel_L2_M4_42:
.Lztrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42
bgt .Lztrmm_kernel_L2_M4_42
ztrmm_kernel_L2_M4_100:
.Lztrmm_kernel_L2_M4_100:
SAVE4x2
@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif
ztrmm_kernel_L2_M4_END:
.Lztrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20
bgt .Lztrmm_kernel_L2_M4_20
ztrmm_kernel_L2_M2_BEGIN:
.Lztrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L2_M1_BEGIN
ble .Lztrmm_kernel_L2_M1_BEGIN
ztrmm_kernel_L2_M2_20:
.Lztrmm_kernel_L2_M2_20:
INIT2x2
@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ztrmm_kernel_L2_M2_40
ble .Lztrmm_kernel_L2_M2_40
ztrmm_kernel_L2_M2_22:
.Lztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22
bgt .Lztrmm_kernel_L2_M2_22
ztrmm_kernel_L2_M2_40:
.Lztrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M2_100
ble .Lztrmm_kernel_L2_M2_100
ztrmm_kernel_L2_M2_42:
.Lztrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42
bgt .Lztrmm_kernel_L2_M2_42
ztrmm_kernel_L2_M2_100:
.Lztrmm_kernel_L2_M2_100:
SAVE2x2
@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif
ztrmm_kernel_L2_M2_END:
.Lztrmm_kernel_L2_M2_END:
ztrmm_kernel_L2_M1_BEGIN:
.Lztrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END
ztrmm_kernel_L2_M1_20:
.Lztrmm_kernel_L2_M1_20:
INIT1x2
@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ztrmm_kernel_L2_M1_40
ble .Lztrmm_kernel_L2_M1_40
ztrmm_kernel_L2_M1_22:
.Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22
bgt .Lztrmm_kernel_L2_M1_22
ztrmm_kernel_L2_M1_40:
.Lztrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M1_100
ble .Lztrmm_kernel_L2_M1_100
ztrmm_kernel_L2_M1_42:
.Lztrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42
bgt .Lztrmm_kernel_L2_M1_42
ztrmm_kernel_L2_M1_100:
.Lztrmm_kernel_L2_M1_100:
SAVE1x2
@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif
ztrmm_kernel_L2_END:
.Lztrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:
/******************************************************************************/
ztrmm_kernel_L1_BEGIN:
.Lztrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble ztrmm_kernel_L999 // done
ble .Lztrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:
ztrmm_kernel_L1_M4_BEGIN:
.Lztrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ztrmm_kernel_L1_M2_BEGIN
ble .Lztrmm_kernel_L1_M2_BEGIN
ztrmm_kernel_L1_M4_20:
.Lztrmm_kernel_L1_M4_20:
INIT4x1
@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M4_40
ble .Lztrmm_kernel_L1_M4_40
.align 5
ztrmm_kernel_L1_M4_22:
.Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22
bgt .Lztrmm_kernel_L1_M4_22
ztrmm_kernel_L1_M4_40:
.Lztrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M4_100
ble .Lztrmm_kernel_L1_M4_100
ztrmm_kernel_L1_M4_42:
.Lztrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42
bgt .Lztrmm_kernel_L1_M4_42
ztrmm_kernel_L1_M4_100:
.Lztrmm_kernel_L1_M4_100:
SAVE4x1
@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif
ztrmm_kernel_L1_M4_END:
.Lztrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20
bgt .Lztrmm_kernel_L1_M4_20
ztrmm_kernel_L1_M2_BEGIN:
.Lztrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L1_M1_BEGIN
ble .Lztrmm_kernel_L1_M1_BEGIN
ztrmm_kernel_L1_M2_20:
.Lztrmm_kernel_L1_M2_20:
INIT2x1
@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M2_40
ble .Lztrmm_kernel_L1_M2_40
ztrmm_kernel_L1_M2_22:
.Lztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22
bgt .Lztrmm_kernel_L1_M2_22
ztrmm_kernel_L1_M2_40:
.Lztrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M2_100
ble .Lztrmm_kernel_L1_M2_100
ztrmm_kernel_L1_M2_42:
.Lztrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42
bgt .Lztrmm_kernel_L1_M2_42
ztrmm_kernel_L1_M2_100:
.Lztrmm_kernel_L1_M2_100:
SAVE2x1
@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif
ztrmm_kernel_L1_M2_END:
.Lztrmm_kernel_L1_M2_END:
ztrmm_kernel_L1_M1_BEGIN:
.Lztrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END
ztrmm_kernel_L1_M1_20:
.Lztrmm_kernel_L1_M1_20:
INIT1x1
@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M1_40
ble .Lztrmm_kernel_L1_M1_40
ztrmm_kernel_L1_M1_22:
.Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22
bgt .Lztrmm_kernel_L1_M1_22
ztrmm_kernel_L1_M1_40:
.Lztrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M1_100
ble .Lztrmm_kernel_L1_M1_100
ztrmm_kernel_L1_M1_42:
.Lztrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42
bgt .Lztrmm_kernel_L1_M1_42
ztrmm_kernel_L1_M1_100:
.Lztrmm_kernel_L1_M1_100:
SAVE1x1
ztrmm_kernel_L1_END:
.Lztrmm_kernel_L1_END:
ztrmm_kernel_L999:
.Lztrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]