ARM64: Convert all labels to local labels

While debugging/profiling applications using perf or other tools, the
kernels appear scattered in the profile reports. This is because the labels
within the kernels are not local and each label is shown as a separate
function.

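To avoid this, all the labels within the kernels are changed to local
labels by prefixing them with `.L` (for example, `amax_kernel_F4` becomes
`.Lamax_kernel_F4`). The assembler does not emit `.L`-prefixed labels into
the object file's symbol table, so profilers attribute the samples to the
enclosing kernel function instead of to each internal label.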
This commit is contained in:
Ashwin Sekhar T K 2017-10-24 10:47:11 +00:00
parent 627133f9ad
commit a0128aa489
50 changed files with 4469 additions and 4469 deletions

View File

@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble amax_kernel_zero ble .Lamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero ble .Lamax_kernel_zero
cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN bne .Lamax_kernel_S_BEGIN
amax_kernel_F_BEGIN: .Lamax_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT beq .Lamax_kernel_F1_INIT
INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1 beq .Lamax_kernel_F1
amax_kernel_F4: .Lamax_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne amax_kernel_F4 bne .Lamax_kernel_F4
amax_kernel_F1: .Lamax_kernel_F1:
ands I, N, #3 ands I, N, #3
ble amax_kernel_L999 ble .Lamax_kernel_L999
amax_kernel_F10: .Lamax_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne amax_kernel_F10 bne .Lamax_kernel_F10
ret ret
amax_kernel_F1_INIT: .Lamax_kernel_F1_INIT:
INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1 b .Lamax_kernel_F1
amax_kernel_S_BEGIN: .Lamax_kernel_S_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble amax_kernel_L999 ble .Lamax_kernel_L999
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1 ble .Lamax_kernel_S1
amax_kernel_S4: .Lamax_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne amax_kernel_S4 bne .Lamax_kernel_S4
amax_kernel_S1: .Lamax_kernel_S1:
ands I, N, #3 ands I, N, #3
ble amax_kernel_L999 ble .Lamax_kernel_L999
amax_kernel_S10: .Lamax_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne amax_kernel_S10 bne .Lamax_kernel_S10
amax_kernel_L999: .Lamax_kernel_L999:
ret ret
amax_kernel_zero: .Lamax_kernel_zero:
fmov MAXF, REG0 fmov MAXF, REG0
ret ret

View File

@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
cmp N, xzr cmp N, xzr
ble asum_kernel_L999 ble .Lasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999 ble .Lasum_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN bne .Lasum_kernel_S_BEGIN
asum_kernel_F_BEGIN: .Lasum_kernel_F_BEGIN:
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1 beq .Lasum_kernel_F1
asum_kernel_F8: .Lasum_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne asum_kernel_F8 bne .Lasum_kernel_F8
KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE
asum_kernel_F1: .Lasum_kernel_F1:
ands I, N, #7 ands I, N, #7
ble asum_kernel_L999 ble .Lasum_kernel_L999
asum_kernel_F10: .Lasum_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne asum_kernel_F10 bne .Lasum_kernel_F10
asum_kernel_L999: .Lasum_kernel_L999:
ret ret
asum_kernel_S_BEGIN: .Lasum_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1 ble .Lasum_kernel_S1
asum_kernel_S4: .Lasum_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S4 bne .Lasum_kernel_S4
asum_kernel_S1: .Lasum_kernel_S1:
ands I, N, #3 ands I, N, #3
ble asum_kernel_L999 ble .Lasum_kernel_L999
asum_kernel_S10: .Lasum_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S10 bne .Lasum_kernel_S10
ret ret

View File

@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble axpy_kernel_L999 ble .Laxpy_kernel_L999
fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999 beq .Laxpy_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN bne .Laxpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN: .Laxpy_kernel_F_BEGIN:
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1 beq .Laxpy_kernel_F1
axpy_kernel_F8: .Laxpy_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne axpy_kernel_F8 bne .Laxpy_kernel_F8
axpy_kernel_F1: .Laxpy_kernel_F1:
ands I, N, #7 ands I, N, #7
ble axpy_kernel_L999 ble .Laxpy_kernel_L999
axpy_kernel_F10: .Laxpy_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10 bne .Laxpy_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
axpy_kernel_S_BEGIN: .Laxpy_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1 ble .Laxpy_kernel_S1
axpy_kernel_S4: .Laxpy_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4 bne .Laxpy_kernel_S4
axpy_kernel_S1: .Laxpy_kernel_S1:
ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999 ble .Laxpy_kernel_L999
axpy_kernel_S10: .Laxpy_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10 bne .Laxpy_kernel_S10
axpy_kernel_L999: .Laxpy_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF fmov s1, SUMF
cmp N, xzr cmp N, xzr
ble asum_kernel_L999 ble .Lcasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999 ble .Lcasum_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN bne .Lcasum_kernel_S_BEGIN
asum_kernel_F_BEGIN: .Lcasum_kernel_F_BEGIN:
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1 beq .Lcasum_kernel_F1
asum_kernel_F8: .Lcasum_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne asum_kernel_F8 bne .Lcasum_kernel_F8
KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE
asum_kernel_F1: .Lcasum_kernel_F1:
ands I, N, #7 ands I, N, #7
ble asum_kernel_L999 ble .Lcasum_kernel_L999
asum_kernel_F10: .Lcasum_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne asum_kernel_F10 bne .Lcasum_kernel_F10
asum_kernel_L999: .Lcasum_kernel_L999:
ret ret
asum_kernel_S_BEGIN: .Lcasum_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1 ble .Lcasum_kernel_S1
asum_kernel_S4: .Lcasum_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S4 bne .Lcasum_kernel_S4
asum_kernel_S1: .Lcasum_kernel_S1:
ands I, N, #3 ands I, N, #3
ble asum_kernel_L999 ble .Lcasum_kernel_L999
asum_kernel_S10: .Lcasum_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S10 bne .Lcasum_kernel_S10
ret ret

View File

@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L4_BEGIN: .Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
add ppA, temp, pA add ppA, temp, pA
cgemm_kernel_L4_M8_BEGIN: .Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN ble .Lcgemm_kernel_L4_M4_BEGIN
cgemm_kernel_L4_M8_20: .Lcgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32 blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a ble .Lcgemm_kernel_L4_M8_22a
.align 5 .align 5
cgemm_kernel_L4_M8_22: .Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22 bgt .Lcgemm_kernel_L4_M8_22
cgemm_kernel_L4_M8_22a: .Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_32: .Lcgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40 ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40: .Lcgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
cgemm_kernel_L4_M8_44: .Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100 ble .Lcgemm_kernel_L4_M8_100
cgemm_kernel_L4_M8_46: .Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
cgemm_kernel_L4_M8_100: .Lcgemm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
cgemm_kernel_L4_M8_END: .Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20 bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN: .Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20: .Lcgemm_kernel_L4_M4_20:
INIT4x4 INIT4x4
mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L4_M4_40 ble .Lcgemm_kernel_L4_M4_40
cgemm_kernel_L4_M4_22: .Lcgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22 bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_40: .Lcgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100 ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_42: .Lcgemm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42 bgt .Lcgemm_kernel_L4_M4_42
cgemm_kernel_L4_M4_100: .Lcgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
cgemm_kernel_L4_M4_END: .Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN: .Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20: .Lcgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40 ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22: .Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22 bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40: .Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100 ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42: .Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42 bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100: .Lcgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
cgemm_kernel_L4_M2_END: .Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN: .Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20: .Lcgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40 ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22: .Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22 bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40: .Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100 ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42: .Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42 bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100: .Lcgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
cgemm_kernel_L4_END: .Lcgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4? ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
cgemm_kernel_L2_M4_BEGIN: .Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20: .Lcgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40 ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5
cgemm_kernel_L2_M4_22: .Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22 bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40: .Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100 ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42: .Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42 bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100: .Lcgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
cgemm_kernel_L2_M4_END: .Lcgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20 bgt .Lcgemm_kernel_L2_M4_20
cgemm_kernel_L2_M2_BEGIN: .Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20: .Lcgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40 ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22: .Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22 bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40: .Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100 ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42: .Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42 bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100: .Lcgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
cgemm_kernel_L2_M2_END: .Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN: .Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20: .Lcgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40 ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22: .Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22 bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40: .Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100 ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42: .Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42 bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100: .Lcgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
cgemm_kernel_L2_END: .Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L1_BEGIN: .Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:
cgemm_kernel_L1_M4_BEGIN: .Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20: .Lcgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40 ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5
cgemm_kernel_L1_M4_22: .Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22 bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40: .Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100 ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42: .Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42 bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100: .Lcgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
cgemm_kernel_L1_M4_END: .Lcgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20 bgt .Lcgemm_kernel_L1_M4_20
cgemm_kernel_L1_M2_BEGIN: .Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20: .Lcgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40 ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22: .Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22 bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40: .Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100 ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42: .Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42 bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100: .Lcgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
cgemm_kernel_L1_M2_END: .Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN: .Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20: .Lcgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40 ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22: .Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22 bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40: .Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100 ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42: .Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42 bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100: .Lcgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
cgemm_kernel_L1_END: .Lcgemm_kernel_L1_END:
cgemm_kernel_L999: .Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L4_BEGIN: .Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN: .Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5 .align 5
cgemm_kernel_L4_M8_20: .Lcgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32 blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a ble .Lcgemm_kernel_L4_M8_22a
.align 5 .align 5
cgemm_kernel_L4_M8_22: .Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22 bgt .Lcgemm_kernel_L4_M8_22
.align 5 .align 5
cgemm_kernel_L4_M8_22a: .Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
.align 5 .align 5
cgemm_kernel_L4_M8_32: .Lcgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40 ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40: .Lcgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
cgemm_kernel_L4_M8_44: .Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100 ble .Lcgemm_kernel_L4_M8_100
.align 5 .align 5
cgemm_kernel_L4_M8_46: .Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46 bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100: .Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
SAVE8x4 SAVE8x4
cgemm_kernel_L4_M8_END: .Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20 bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN: .Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20: .Lcgemm_kernel_L4_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32 blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5
cgemm_kernel_L4_M4_22: .Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22 bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a: .Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44 b .Lcgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32: .Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40 ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44 b .Lcgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40: .Lcgemm_kernel_L4_M4_40:
INIT4x4 INIT4x4
cgemm_kernel_L4_M4_44: .Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100 ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_46: .Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
cgemm_kernel_L4_M4_100: .Lcgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
cgemm_kernel_L4_M4_END: .Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN: .Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20: .Lcgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40 ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22: .Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22 bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40: .Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100 ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42: .Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42 bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100: .Lcgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
cgemm_kernel_L4_M2_END: .Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN: .Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20: .Lcgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40 ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22: .Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22 bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40: .Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100 ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42: .Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42 bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100: .Lcgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
cgemm_kernel_L4_END: .Lcgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // N % 4 == 0: no columns left, done ble .Lcgemm_kernel_L999 // N % 4 == 0: no columns left, done
tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN: .Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20: .Lcgemm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40 ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5
cgemm_kernel_L2_M8_22: .Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22 bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40: .Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100 ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42: .Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42 bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100: .Lcgemm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
cgemm_kernel_L2_M8_END: .Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20 bgt .Lcgemm_kernel_L2_M8_20
cgemm_kernel_L2_M4_BEGIN: .Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
tst counterI, #4 // test origM & 4; counterI is not modified tst counterI, #4 // test origM & 4; counterI is not modified
ble cgemm_kernel_L2_M2_BEGIN ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20: .Lcgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40 ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5
cgemm_kernel_L2_M4_22: .Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22 bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40: .Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100 ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42: .Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42 bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100: .Lcgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
cgemm_kernel_L2_M4_END: .Lcgemm_kernel_L2_M4_END:
cgemm_kernel_L2_M2_BEGIN: .Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
tst counterI, #2 // test origM & 2; counterI is not modified tst counterI, #2 // test origM & 2; counterI is not modified
ble cgemm_kernel_L2_M1_BEGIN ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20: .Lcgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40 ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22: .Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22 bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40: .Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100 ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42: .Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42 bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100: .Lcgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
cgemm_kernel_L2_M2_END: .Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN: .Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20: .Lcgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40 ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22: .Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22 bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40: .Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100 ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42: .Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42 bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100: .Lcgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
cgemm_kernel_L2_END: .Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L1_BEGIN: .Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN: .Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20: .Lcgemm_kernel_L1_M8_20:
INIT8x1 INIT8x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40 ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5
cgemm_kernel_L1_M8_22: .Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22 bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40: .Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100 ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42: .Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42 bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100: .Lcgemm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
cgemm_kernel_L1_M8_END: .Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20 bgt .Lcgemm_kernel_L1_M8_20
cgemm_kernel_L1_M4_BEGIN: .Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
tst counterI, #4 // test origM & 4; counterI is not modified tst counterI, #4 // test origM & 4; counterI is not modified
ble cgemm_kernel_L1_M2_BEGIN ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20: .Lcgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40 ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5
cgemm_kernel_L1_M4_22: .Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22 bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40: .Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100 ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42: .Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42 bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100: .Lcgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
cgemm_kernel_L1_M4_END: .Lcgemm_kernel_L1_M4_END:
cgemm_kernel_L1_M2_BEGIN: .Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
tst counterI, #2 // test origM & 2; counterI is not modified tst counterI, #2 // test origM & 2; counterI is not modified
ble cgemm_kernel_L1_M1_BEGIN ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20: .Lcgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40 ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22: .Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22 bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40: .Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100 ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42: .Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42 bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100: .Lcgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
cgemm_kernel_L1_M2_END: .Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN: .Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20: .Lcgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40 ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22: .Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22 bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40: .Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100 ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42: .Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42 bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100: .Lcgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
cgemm_kernel_L1_END: .Lcgemm_kernel_L1_END:
cgemm_kernel_L999: .Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L4_BEGIN: .Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
cgemm_kernel_L4_M8_BEGIN: .Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5 .align 5
cgemm_kernel_L4_M8_20: .Lcgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #5 // origK / 32 asr counterL , origK, #5 // origK / 32
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32 blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a ble .Lcgemm_kernel_L4_M8_22a
.align 5 .align 5
cgemm_kernel_L4_M8_22: .Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22 bgt .Lcgemm_kernel_L4_M8_22
.align 5 .align 5
cgemm_kernel_L4_M8_22a: .Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x4
@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
.align 5 .align 5
cgemm_kernel_L4_M8_32: .Lcgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40 ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b cgemm_kernel_L4_M8_44 b .Lcgemm_kernel_L4_M8_44
cgemm_kernel_L4_M8_40: .Lcgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
cgemm_kernel_L4_M8_44: .Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #31 ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100 ble .Lcgemm_kernel_L4_M8_100
.align 5 .align 5
cgemm_kernel_L4_M8_46: .Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46 bne .Lcgemm_kernel_L4_M8_46
cgemm_kernel_L4_M8_100: .Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
SAVE8x4 SAVE8x4
cgemm_kernel_L4_M8_END: .Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20 bne .Lcgemm_kernel_L4_M8_20
cgemm_kernel_L4_M4_BEGIN: .Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN ble .Lcgemm_kernel_L4_M2_BEGIN
cgemm_kernel_L4_M4_20: .Lcgemm_kernel_L4_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32 blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5
cgemm_kernel_L4_M4_22: .Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22 bgt .Lcgemm_kernel_L4_M4_22
cgemm_kernel_L4_M4_22a: .Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44 b .Lcgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32: .Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40 ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44 b .Lcgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40: .Lcgemm_kernel_L4_M4_40:
INIT4x4 INIT4x4
cgemm_kernel_L4_M4_44: .Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100 ble .Lcgemm_kernel_L4_M4_100
cgemm_kernel_L4_M4_46: .Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
cgemm_kernel_L4_M4_100: .Lcgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
cgemm_kernel_L4_M4_END: .Lcgemm_kernel_L4_M4_END:
cgemm_kernel_L4_M2_BEGIN: .Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
tst counterI, #2 // test origM & 2; counterI is not modified tst counterI, #2 // test origM & 2; counterI is not modified
ble cgemm_kernel_L4_M1_BEGIN ble .Lcgemm_kernel_L4_M1_BEGIN
cgemm_kernel_L4_M2_20: .Lcgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40 ble .Lcgemm_kernel_L4_M2_40
cgemm_kernel_L4_M2_22: .Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22 bgt .Lcgemm_kernel_L4_M2_22
cgemm_kernel_L4_M2_40: .Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100 ble .Lcgemm_kernel_L4_M2_100
cgemm_kernel_L4_M2_42: .Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42 bgt .Lcgemm_kernel_L4_M2_42
cgemm_kernel_L4_M2_100: .Lcgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
cgemm_kernel_L4_M2_END: .Lcgemm_kernel_L4_M2_END:
cgemm_kernel_L4_M1_BEGIN: .Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END ble .Lcgemm_kernel_L4_END
cgemm_kernel_L4_M1_20: .Lcgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40 ble .Lcgemm_kernel_L4_M1_40
cgemm_kernel_L4_M1_22: .Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22 bgt .Lcgemm_kernel_L4_M1_22
cgemm_kernel_L4_M1_40: .Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100 ble .Lcgemm_kernel_L4_M1_100
cgemm_kernel_L4_M1_42: .Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42 bgt .Lcgemm_kernel_L4_M1_42
cgemm_kernel_L4_M1_100: .Lcgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
cgemm_kernel_L4_END: .Lcgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // N % 4 == 0: no columns left, done ble .Lcgemm_kernel_L999 // N % 4 == 0: no columns left, done
tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
cgemm_kernel_L2_M8_BEGIN: .Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN ble .Lcgemm_kernel_L2_M4_BEGIN
cgemm_kernel_L2_M8_20: .Lcgemm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40 ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5
cgemm_kernel_L2_M8_22: .Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22 bgt .Lcgemm_kernel_L2_M8_22
cgemm_kernel_L2_M8_40: .Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100 ble .Lcgemm_kernel_L2_M8_100
cgemm_kernel_L2_M8_42: .Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42 bgt .Lcgemm_kernel_L2_M8_42
cgemm_kernel_L2_M8_100: .Lcgemm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
cgemm_kernel_L2_M8_END: .Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20 bgt .Lcgemm_kernel_L2_M8_20
cgemm_kernel_L2_M4_BEGIN: .Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
tst counterI, #4 // test origM & 4; counterI is not modified tst counterI, #4 // test origM & 4; counterI is not modified
ble cgemm_kernel_L2_M2_BEGIN ble .Lcgemm_kernel_L2_M2_BEGIN
cgemm_kernel_L2_M4_20: .Lcgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40 ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5
cgemm_kernel_L2_M4_22: .Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22 bgt .Lcgemm_kernel_L2_M4_22
cgemm_kernel_L2_M4_40: .Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100 ble .Lcgemm_kernel_L2_M4_100
cgemm_kernel_L2_M4_42: .Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42 bgt .Lcgemm_kernel_L2_M4_42
cgemm_kernel_L2_M4_100: .Lcgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
cgemm_kernel_L2_M4_END: .Lcgemm_kernel_L2_M4_END:
cgemm_kernel_L2_M2_BEGIN: .Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
tst counterI, #2 // test origM & 2; counterI is not modified tst counterI, #2 // test origM & 2; counterI is not modified
ble cgemm_kernel_L2_M1_BEGIN ble .Lcgemm_kernel_L2_M1_BEGIN
cgemm_kernel_L2_M2_20: .Lcgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40 ble .Lcgemm_kernel_L2_M2_40
cgemm_kernel_L2_M2_22: .Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22 bgt .Lcgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_40: .Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100 ble .Lcgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_42: .Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42 bgt .Lcgemm_kernel_L2_M2_42
cgemm_kernel_L2_M2_100: .Lcgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
cgemm_kernel_L2_M2_END: .Lcgemm_kernel_L2_M2_END:
cgemm_kernel_L2_M1_BEGIN: .Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END ble .Lcgemm_kernel_L2_END
cgemm_kernel_L2_M1_20: .Lcgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40 ble .Lcgemm_kernel_L2_M1_40
cgemm_kernel_L2_M1_22: .Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22 bgt .Lcgemm_kernel_L2_M1_22
cgemm_kernel_L2_M1_40: .Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100 ble .Lcgemm_kernel_L2_M1_100
cgemm_kernel_L2_M1_42: .Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42 bgt .Lcgemm_kernel_L2_M1_42
cgemm_kernel_L2_M1_100: .Lcgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
cgemm_kernel_L2_END: .Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
cgemm_kernel_L1_BEGIN: .Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
cgemm_kernel_L1_M8_BEGIN: .Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN ble .Lcgemm_kernel_L1_M4_BEGIN
cgemm_kernel_L1_M8_20: .Lcgemm_kernel_L1_M8_20:
INIT8x1 INIT8x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40 ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5
cgemm_kernel_L1_M8_22: .Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22 bgt .Lcgemm_kernel_L1_M8_22
cgemm_kernel_L1_M8_40: .Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100 ble .Lcgemm_kernel_L1_M8_100
cgemm_kernel_L1_M8_42: .Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42 bgt .Lcgemm_kernel_L1_M8_42
cgemm_kernel_L1_M8_100: .Lcgemm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
cgemm_kernel_L1_M8_END: .Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20 bgt .Lcgemm_kernel_L1_M8_20
cgemm_kernel_L1_M4_BEGIN: .Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
tst counterI, #4 // test origM & 4; counterI is not modified tst counterI, #4 // test origM & 4; counterI is not modified
ble cgemm_kernel_L1_M2_BEGIN ble .Lcgemm_kernel_L1_M2_BEGIN
cgemm_kernel_L1_M4_20: .Lcgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40 ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5
cgemm_kernel_L1_M4_22: .Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22 bgt .Lcgemm_kernel_L1_M4_22
cgemm_kernel_L1_M4_40: .Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100 ble .Lcgemm_kernel_L1_M4_100
cgemm_kernel_L1_M4_42: .Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42 bgt .Lcgemm_kernel_L1_M4_42
cgemm_kernel_L1_M4_100: .Lcgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
cgemm_kernel_L1_M4_END: .Lcgemm_kernel_L1_M4_END:
cgemm_kernel_L1_M2_BEGIN: .Lcgemm_kernel_L1_M2_BEGIN:
// Decide from the low two bits of origM whether any work is left for this
// column and, if so, whether the 2x1 path that follows has to run.
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
// The masked result is never negative, so ble is taken only when (M & 3) == 0.
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
tst counterI, #2 // is bit 1 of M set? (counterI is not modified) tst counterI, #2 // is bit 1 of M set? (counterI is not modified)
ble cgemm_kernel_L1_M1_BEGIN ble .Lcgemm_kernel_L1_M1_BEGIN
cgemm_kernel_L1_M2_20: .Lcgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40 ble .Lcgemm_kernel_L1_M2_40
cgemm_kernel_L1_M2_22: .Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22 bgt .Lcgemm_kernel_L1_M2_22
cgemm_kernel_L1_M2_40: .Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100 ble .Lcgemm_kernel_L1_M2_100
cgemm_kernel_L1_M2_42: .Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42 bgt .Lcgemm_kernel_L1_M2_42
cgemm_kernel_L1_M2_100: .Lcgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
cgemm_kernel_L1_M2_END: .Lcgemm_kernel_L1_M2_END:
cgemm_kernel_L1_M1_BEGIN: .Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END ble .Lcgemm_kernel_L1_END
cgemm_kernel_L1_M1_20: .Lcgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40 ble .Lcgemm_kernel_L1_M1_40
cgemm_kernel_L1_M1_22: .Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22 bgt .Lcgemm_kernel_L1_M1_22
cgemm_kernel_L1_M1_40: .Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100 ble .Lcgemm_kernel_L1_M1_100
cgemm_kernel_L1_M1_42: .Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42 bgt .Lcgemm_kernel_L1_M1_42
cgemm_kernel_L1_M1_100: .Lcgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
cgemm_kernel_L1_END: .Lcgemm_kernel_L1_END:
cgemm_kernel_L999: .Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Driver body of the copy kernel. The register aliases (N, INC_X, INC_Y, I)
// and the PROLOGUE / INIT_S / KERNEL_* macros are defined outside this excerpt;
// judging by the names this is presumably a BLAS-style vector copy - confirm
// against the macro definitions. Both exits return 0 in w0.
PROLOGUE PROLOGUE
// Nothing to do when N <= 0.
cmp N, xzr cmp N, xzr
ble copy_kernel_L999 ble .Lcopy_kernel_L999
// Take the strided path unless both increments are exactly 1.
cmp INC_X, #1 cmp INC_X, #1
bne copy_kernel_S_BEGIN bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne copy_kernel_S_BEGIN bne .Lcopy_kernel_S_BEGIN
// Unit-stride path: N / 4 trips of KERNEL_F4, then N % 4 trips of KERNEL_F1.
copy_kernel_F_BEGIN: .Lcopy_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq copy_kernel_F1 beq .Lcopy_kernel_F1
copy_kernel_F4: .Lcopy_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne copy_kernel_F4 bne .Lcopy_kernel_F4
copy_kernel_F1: .Lcopy_kernel_F1:
// I = N & 3 is in 0..3, so ble is taken only when no tail elements remain.
ands I, N, #3 ands I, N, #3
ble copy_kernel_L999 ble .Lcopy_kernel_L999
copy_kernel_F10: .Lcopy_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne copy_kernel_F10 bne .Lcopy_kernel_F10
// Unit-stride path finished: return 0.
mov w0, wzr mov w0, wzr
ret ret
// Strided path: INIT_S runs once, then the same N / 4 + N % 4 split is done
// with KERNEL_S1 only.
copy_kernel_S_BEGIN: .Lcopy_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble copy_kernel_S1 ble .Lcopy_kernel_S1
// Unrolled main loop. The diff hunk boundary below hides part of the source,
// so the exact number of KERNEL_S1 invocations per trip is not visible here
// (presumably four, to match the N / 4 trip count - confirm in the full file).
copy_kernel_S4: .Lcopy_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne copy_kernel_S4 bne .Lcopy_kernel_S4
copy_kernel_S1: .Lcopy_kernel_S1:
// Tail: N & 3 trips, one KERNEL_S1 each; skipped entirely when the count is 0.
ands I, N, #3 ands I, N, #3
ble copy_kernel_L999 ble .Lcopy_kernel_L999
copy_kernel_S10: .Lcopy_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne copy_kernel_S10 bne .Lcopy_kernel_S10
// Common exit for N <= 0, an empty tail, and the end of the strided path.
copy_kernel_L999: .Lcopy_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L4_BEGIN: .Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M4_BEGIN: .Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN ble .Lctrmm_kernel_L4_M2_BEGIN
ctrmm_kernel_L4_M4_20: .Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32 blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5
ctrmm_kernel_L4_M4_22: .Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22 bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a: .Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44 b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32: .Lctrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40 ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44 b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40: .Lctrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
ctrmm_kernel_L4_M4_44: .Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100 ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46: .Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100: .Lctrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L4_M4_END: .Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20 bne .Lctrmm_kernel_L4_M4_20
ctrmm_kernel_L4_M2_BEGIN: .Lctrmm_kernel_L4_M2_BEGIN:
// Decide from the low two bits of origM whether any work is left in this
// 4-wide N panel and, if so, whether the 2x4 path that follows has to run.
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
// The masked result is never negative, so ble is taken only when (M & 3) == 0.
ble ctrmm_kernel_L4_END ble .Lctrmm_kernel_L4_END
tst counterI, #2 // is bit 1 of M set? (counterI is not modified) tst counterI, #2 // is bit 1 of M set? (counterI is not modified)
ble ctrmm_kernel_L4_M1_BEGIN ble .Lctrmm_kernel_L4_M1_BEGIN
ctrmm_kernel_L4_M2_20: .Lctrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40 ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22: .Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22 bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40: .Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100 ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42: .Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42 bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100: .Lctrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L4_M2_END: .Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN: .Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20: .Lctrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40 ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22: .Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22 bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40: .Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100 ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42: .Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42 bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100: .Lctrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
ctrmm_kernel_L4_END: .Lctrmm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction .Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
// Reached either directly (N / 4 == 0) or after the last 4-wide pass; what
// remains is N & 3. Bit 1 selects the 2-wide pass below, bit 0 is handled at
// ctrmm_kernel_L1_BEGIN.
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // nothing left: N is a multiple of 4 ble .Lctrmm_kernel_L999 // nothing left: N is a multiple of 4
tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
ctrmm_kernel_L2_M4_BEGIN: .Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN ble .Lctrmm_kernel_L2_M2_BEGIN
ctrmm_kernel_L2_M4_20: .Lctrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40 ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5
ctrmm_kernel_L2_M4_22: .Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22 bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40: .Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100 ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42: .Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42 bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100: .Lctrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L2_M4_END: .Lctrmm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20 bgt .Lctrmm_kernel_L2_M4_20
ctrmm_kernel_L2_M2_BEGIN: .Lctrmm_kernel_L2_M2_BEGIN:
// Decide from the low two bits of origM whether any work is left in this
// 2-wide N panel and, if so, whether the 2x2 path that follows has to run.
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
// The masked result is never negative, so ble is taken only when (M & 3) == 0.
ble ctrmm_kernel_L2_END ble .Lctrmm_kernel_L2_END
tst counterI, #2 // is bit 1 of M set? (counterI is not modified) tst counterI, #2 // is bit 1 of M set? (counterI is not modified)
ble ctrmm_kernel_L2_M1_BEGIN ble .Lctrmm_kernel_L2_M1_BEGIN
ctrmm_kernel_L2_M2_20: .Lctrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40 ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22: .Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22 bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40: .Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100 ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42: .Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42 bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100: .Lctrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L2_M2_END: .Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN: .Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20: .Lctrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40 ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22: .Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22 bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40: .Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100 ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42: .Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42 bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100: .Lctrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
ctrmm_kernel_L2_END: .Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L1_BEGIN: .Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
ctrmm_kernel_L1_M4_BEGIN: .Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20: .Lctrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40 ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5
ctrmm_kernel_L1_M4_22: .Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22 bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40: .Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100 ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42: .Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42 bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100: .Lctrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L1_M4_END: .Lctrmm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20 bgt .Lctrmm_kernel_L1_M4_20
ctrmm_kernel_L1_M2_BEGIN: .Lctrmm_kernel_L1_M2_BEGIN:
// Decide from the low two bits of origM whether any work is left in this
// 1-wide N panel and, if so, whether the 2x1 path that follows has to run.
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
// The masked result is never negative, so ble is taken only when (M & 3) == 0.
ble ctrmm_kernel_L1_END ble .Lctrmm_kernel_L1_END
tst counterI, #2 // is bit 1 of M set? (counterI is not modified) tst counterI, #2 // is bit 1 of M set? (counterI is not modified)
ble ctrmm_kernel_L1_M1_BEGIN ble .Lctrmm_kernel_L1_M1_BEGIN
ctrmm_kernel_L1_M2_20: .Lctrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40 ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22: .Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22 bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40: .Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100 ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42: .Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42 bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100: .Lctrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L1_M2_END: .Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN: .Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20: .Lctrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40 ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22: .Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22 bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40: .Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100 ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42: .Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42 bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100: .Lctrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
ctrmm_kernel_L1_END: .Lctrmm_kernel_L1_END:
ctrmm_kernel_L999: .Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L4_BEGIN: .Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
ctrmm_kernel_L4_M8_BEGIN: .Lctrmm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN ble .Lctrmm_kernel_L4_M4_BEGIN
ctrmm_kernel_L4_M8_20: .Lctrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:
asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ctrmm_kernel_L4_M8_32 blt .Lctrmm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a ble .Lctrmm_kernel_L4_M8_22a
.align 5 .align 5
ctrmm_kernel_L4_M8_22: .Lctrmm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22 bgt .Lctrmm_kernel_L4_M8_22
.align 5 .align 5
ctrmm_kernel_L4_M8_22a: .Lctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b ctrmm_kernel_L4_M8_44 b .Lctrmm_kernel_L4_M8_44
.align 5 .align 5
ctrmm_kernel_L4_M8_32: .Lctrmm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M8_40 ble .Lctrmm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b ctrmm_kernel_L4_M8_44 b .Lctrmm_kernel_L4_M8_44
ctrmm_kernel_L4_M8_40: .Lctrmm_kernel_L4_M8_40:
INIT8x4 INIT8x4
ctrmm_kernel_L4_M8_44: .Lctrmm_kernel_L4_M8_44:
ands counterL , tempK, #7 ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100 ble .Lctrmm_kernel_L4_M8_100
.align 5 .align 5
ctrmm_kernel_L4_M8_46: .Lctrmm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46 bne .Lctrmm_kernel_L4_M8_46
ctrmm_kernel_L4_M8_100: .Lctrmm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
ctrmm_kernel_L4_M8_END: .Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20 bne .Lctrmm_kernel_L4_M8_20
ctrmm_kernel_L4_M4_BEGIN: .Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L4_END ble .Lctrmm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN ble .Lctrmm_kernel_L4_M2_BEGIN
ctrmm_kernel_L4_M4_20: .Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32 blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5
ctrmm_kernel_L4_M4_22: .Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22 bgt .Lctrmm_kernel_L4_M4_22
ctrmm_kernel_L4_M4_22a: .Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44 b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32: .Lctrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40 ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44 b .Lctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40: .Lctrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
ctrmm_kernel_L4_M4_44: .Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100 ble .Lctrmm_kernel_L4_M4_100
ctrmm_kernel_L4_M4_46: .Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
ctrmm_kernel_L4_M4_100: .Lctrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L4_M4_END: .Lctrmm_kernel_L4_M4_END:
ctrmm_kernel_L4_M2_BEGIN: .Lctrmm_kernel_L4_M2_BEGIN:
// Decide from the low two bits of origM whether any work is left in this
// 4-wide N panel and, if so, whether the 2x4 path that follows has to run.
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
// The masked result is never negative, so ble is taken only when (M & 3) == 0.
ble ctrmm_kernel_L4_END ble .Lctrmm_kernel_L4_END
tst counterI, #2 // is bit 1 of M set? (counterI is not modified) tst counterI, #2 // is bit 1 of M set? (counterI is not modified)
ble ctrmm_kernel_L4_M1_BEGIN ble .Lctrmm_kernel_L4_M1_BEGIN
ctrmm_kernel_L4_M2_20: .Lctrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40 ble .Lctrmm_kernel_L4_M2_40
ctrmm_kernel_L4_M2_22: .Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22 bgt .Lctrmm_kernel_L4_M2_22
ctrmm_kernel_L4_M2_40: .Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100 ble .Lctrmm_kernel_L4_M2_100
ctrmm_kernel_L4_M2_42: .Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42 bgt .Lctrmm_kernel_L4_M2_42
ctrmm_kernel_L4_M2_100: .Lctrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L4_M2_END: .Lctrmm_kernel_L4_M2_END:
ctrmm_kernel_L4_M1_BEGIN: .Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END ble .Lctrmm_kernel_L4_END
ctrmm_kernel_L4_M1_20: .Lctrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40 ble .Lctrmm_kernel_L4_M1_40
ctrmm_kernel_L4_M1_22: .Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22 bgt .Lctrmm_kernel_L4_M1_22
ctrmm_kernel_L4_M1_40: .Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100 ble .Lctrmm_kernel_L4_M1_100
ctrmm_kernel_L4_M1_42: .Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42 bgt .Lctrmm_kernel_L4_M1_42
ctrmm_kernel_L4_M1_100: .Lctrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
ctrmm_kernel_L4_END: .Lctrmm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction .Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
// Reached either directly (N / 4 == 0) or after the last 4-wide pass; what
// remains is N & 3. Bit 1 selects the 2-wide pass below, bit 0 is handled at
// ctrmm_kernel_L1_BEGIN.
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // nothing left: N is a multiple of 4 ble .Lctrmm_kernel_L999 // nothing left: N is a multiple of 4
tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A
ctrmm_kernel_L2_M8_BEGIN: .Lctrmm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN ble .Lctrmm_kernel_L2_M4_BEGIN
ctrmm_kernel_L2_M8_20: .Lctrmm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M8_40 ble .Lctrmm_kernel_L2_M8_40
.align 5 .align 5
ctrmm_kernel_L2_M8_22: .Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22 bgt .Lctrmm_kernel_L2_M8_22
ctrmm_kernel_L2_M8_40: .Lctrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100 ble .Lctrmm_kernel_L2_M8_100
ctrmm_kernel_L2_M8_42: .Lctrmm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42 bgt .Lctrmm_kernel_L2_M8_42
ctrmm_kernel_L2_M8_100: .Lctrmm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif
ctrmm_kernel_L2_M8_END: .Lctrmm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20 bgt .Lctrmm_kernel_L2_M8_20
ctrmm_kernel_L2_M4_BEGIN: .Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L2_END ble .Lctrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L2_M2_BEGIN ble .Lctrmm_kernel_L2_M2_BEGIN
ctrmm_kernel_L2_M4_20: .Lctrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40 ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5
ctrmm_kernel_L2_M4_22: .Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22 bgt .Lctrmm_kernel_L2_M4_22
ctrmm_kernel_L2_M4_40: .Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100 ble .Lctrmm_kernel_L2_M4_100
ctrmm_kernel_L2_M4_42: .Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42 bgt .Lctrmm_kernel_L2_M4_42
ctrmm_kernel_L2_M4_100: .Lctrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L2_M4_END: .Lctrmm_kernel_L2_M4_END:
ctrmm_kernel_L2_M2_BEGIN: .Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L2_END ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN ble .Lctrmm_kernel_L2_M1_BEGIN
ctrmm_kernel_L2_M2_20: .Lctrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40 ble .Lctrmm_kernel_L2_M2_40
ctrmm_kernel_L2_M2_22: .Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22 bgt .Lctrmm_kernel_L2_M2_22
ctrmm_kernel_L2_M2_40: .Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100 ble .Lctrmm_kernel_L2_M2_100
ctrmm_kernel_L2_M2_42: .Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42 bgt .Lctrmm_kernel_L2_M2_42
ctrmm_kernel_L2_M2_100: .Lctrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L2_M2_END: .Lctrmm_kernel_L2_M2_END:
ctrmm_kernel_L2_M1_BEGIN: .Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END ble .Lctrmm_kernel_L2_END
ctrmm_kernel_L2_M1_20: .Lctrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40 ble .Lctrmm_kernel_L2_M1_40
ctrmm_kernel_L2_M1_22: .Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22 bgt .Lctrmm_kernel_L2_M1_22
ctrmm_kernel_L2_M1_40: .Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100 ble .Lctrmm_kernel_L2_M1_100
ctrmm_kernel_L2_M1_42: .Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42 bgt .Lctrmm_kernel_L2_M1_42
ctrmm_kernel_L2_M1_100: .Lctrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
ctrmm_kernel_L2_END: .Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
ctrmm_kernel_L1_BEGIN: .Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A
ctrmm_kernel_L1_M8_BEGIN: .Lctrmm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN ble .Lctrmm_kernel_L1_M4_BEGIN
ctrmm_kernel_L1_M8_20: .Lctrmm_kernel_L1_M8_20:
INIT8x1 INIT8x1
@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M8_40 ble .Lctrmm_kernel_L1_M8_40
.align 5 .align 5
ctrmm_kernel_L1_M8_22: .Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22 bgt .Lctrmm_kernel_L1_M8_22
ctrmm_kernel_L1_M8_40: .Lctrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100 ble .Lctrmm_kernel_L1_M8_100
ctrmm_kernel_L1_M8_42: .Lctrmm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42 bgt .Lctrmm_kernel_L1_M8_42
ctrmm_kernel_L1_M8_100: .Lctrmm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif
ctrmm_kernel_L1_M8_END: .Lctrmm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20 bgt .Lctrmm_kernel_L1_M8_20
ctrmm_kernel_L1_M4_BEGIN: .Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L1_END ble .Lctrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN ble .Lctrmm_kernel_L1_M2_BEGIN
ctrmm_kernel_L1_M4_20: .Lctrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40 ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5
ctrmm_kernel_L1_M4_22: .Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22 bgt .Lctrmm_kernel_L1_M4_22
ctrmm_kernel_L1_M4_40: .Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100 ble .Lctrmm_kernel_L1_M4_100
ctrmm_kernel_L1_M4_42: .Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42 bgt .Lctrmm_kernel_L1_M4_42
ctrmm_kernel_L1_M4_100: .Lctrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ctrmm_kernel_L1_M4_END: .Lctrmm_kernel_L1_M4_END:
ctrmm_kernel_L1_M2_BEGIN: .Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L1_END ble .Lctrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN ble .Lctrmm_kernel_L1_M1_BEGIN
ctrmm_kernel_L1_M2_20: .Lctrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40 ble .Lctrmm_kernel_L1_M2_40
ctrmm_kernel_L1_M2_22: .Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22 bgt .Lctrmm_kernel_L1_M2_22
ctrmm_kernel_L1_M2_40: .Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100 ble .Lctrmm_kernel_L1_M2_100
ctrmm_kernel_L1_M2_42: .Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42 bgt .Lctrmm_kernel_L1_M2_42
ctrmm_kernel_L1_M2_100: .Lctrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ctrmm_kernel_L1_M2_END: .Lctrmm_kernel_L1_M2_END:
ctrmm_kernel_L1_M1_BEGIN: .Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END ble .Lctrmm_kernel_L1_END
ctrmm_kernel_L1_M1_20: .Lctrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40 ble .Lctrmm_kernel_L1_M1_40
ctrmm_kernel_L1_M1_22: .Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22 bgt .Lctrmm_kernel_L1_M1_22
ctrmm_kernel_L1_M1_40: .Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100 ble .Lctrmm_kernel_L1_M1_100
ctrmm_kernel_L1_M1_42: .Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42 bgt .Lctrmm_kernel_L1_M1_42
ctrmm_kernel_L1_M1_100: .Lctrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
ctrmm_kernel_L1_END: .Lctrmm_kernel_L1_END:
ctrmm_kernel_L999: .Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble axpy_kernel_L999 ble .Ldaxpy_kernel_L999
fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999 beq .Ldaxpy_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN bne .Ldaxpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN: .Ldaxpy_kernel_F_BEGIN:
asr I, N, #5 asr I, N, #5
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1 beq .Ldaxpy_kernel_F1
.align 5 .align 5
axpy_kernel_F32: .Ldaxpy_kernel_F32:
KERNEL_F32 KERNEL_F32
subs I, I, #1 subs I, I, #1
bne axpy_kernel_F32 bne .Ldaxpy_kernel_F32
axpy_kernel_F1: .Ldaxpy_kernel_F1:
ands I, N, #31 ands I, N, #31
ble axpy_kernel_L999 ble .Ldaxpy_kernel_L999
axpy_kernel_F10: .Ldaxpy_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10 bne .Ldaxpy_kernel_F10
b axpy_kernel_L999 b .Ldaxpy_kernel_L999
axpy_kernel_S_BEGIN: .Ldaxpy_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1 ble .Ldaxpy_kernel_S1
axpy_kernel_S4: .Ldaxpy_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4 bne .Ldaxpy_kernel_S4
axpy_kernel_S1: .Ldaxpy_kernel_S1:
ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999 ble .Ldaxpy_kernel_L999
axpy_kernel_S10: .Ldaxpy_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10 bne .Ldaxpy_kernel_S10
axpy_kernel_L999: .Ldaxpy_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN ble .Ldgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN: .Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
dgemm_kernel_L4_M8_BEGIN: .Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L4_M8_20: .Ldgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #2 // L = K / 4 asr counterL , origK, #2 // L = K / 4
cmp counterL , #2 cmp counterL , #2
blt dgemm_kernel_L4_M8_32 blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a ble .Ldgemm_kernel_L4_M8_22a
.align 5 .align 5
dgemm_kernel_L4_M8_22: .Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22 bgt .Ldgemm_kernel_L4_M8_22
.align 5 .align 5
dgemm_kernel_L4_M8_22a: .Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
.align 5 .align 5
dgemm_kernel_L4_M8_32: .Ldgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40 ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40: .Ldgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
dgemm_kernel_L4_M8_44: .Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #3 ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100 ble .Ldgemm_kernel_L4_M8_100
.align 5 .align 5
dgemm_kernel_L4_M8_46: .Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46 bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100: .Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5 lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp] prfm PLDL1KEEP, [ppA, temp]
@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
dgemm_kernel_L4_M8_END: .Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20 bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN: .Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20: .Ldgemm_kernel_L4_M4_20:
INIT4x4 INIT4x4
mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L4_M4_40 ble .Ldgemm_kernel_L4_M4_40
dgemm_kernel_L4_M4_22: .Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22 bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40: .Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100 ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42: .Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42 bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100: .Ldgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
dgemm_kernel_L4_M4_END: .Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN: .Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20: .Ldgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40 ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22: .Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22 bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40: .Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100 ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42: .Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42 bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100: .Ldgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
dgemm_kernel_L4_M2_END: .Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN: .Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20: .Ldgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40 ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22: .Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22 bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40: .Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100 ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42: .Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42 bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100: .Ldgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
dgemm_kernel_L4_END: .Ldgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4? ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
dgemm_kernel_L2_M4_BEGIN: .Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20: .Ldgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40 ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5
dgemm_kernel_L2_M4_22: .Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22 bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40: .Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100 ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42: .Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42 bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100: .Ldgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
dgemm_kernel_L2_M4_END: .Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20 bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN: .Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20: .Ldgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40 ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22: .Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22 bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40: .Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100 ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42: .Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42 bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100: .Ldgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
dgemm_kernel_L2_M2_END: .Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN: .Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20: .Ldgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40 ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22: .Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22 bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40: .Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100 ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42: .Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42 bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100: .Ldgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
dgemm_kernel_L2_END: .Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L1_BEGIN: .Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
dgemm_kernel_L1_M4_BEGIN: .Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20: .Ldgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40 ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5
dgemm_kernel_L1_M4_22: .Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22 bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40: .Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100 ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42: .Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42 bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100: .Ldgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
dgemm_kernel_L1_M4_END: .Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20 bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN: .Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20: .Ldgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40 ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22: .Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22 bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40: .Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100 ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42: .Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42 bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100: .Ldgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
dgemm_kernel_L1_M2_END: .Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN: .Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20: .Ldgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40 ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22: .Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22 bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40: .Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100 ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42: .Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42 bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100: .Ldgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dgemm_kernel_L1_END: .Ldgemm_kernel_L1_END:
dgemm_kernel_L999: .Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L8_BEGIN: .Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dgemm_kernel_L8_M4_BEGIN: .Ldgemm_kernel_L8_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN ble .Ldgemm_kernel_L8_M2_BEGIN
dgemm_kernel_L8_M4_20: .Ldgemm_kernel_L8_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32 blt .Ldgemm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a ble .Ldgemm_kernel_L8_M4_22a
.align 5 .align 5
dgemm_kernel_L8_M4_22: .Ldgemm_kernel_L8_M4_22:
KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22 bgt .Ldgemm_kernel_L8_M4_22
dgemm_kernel_L8_M4_22a: .Ldgemm_kernel_L8_M4_22a:
KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E
b dgemm_kernel_L8_M4_44 b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_32: .Ldgemm_kernel_L8_M4_32:
tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L8_M4_40 ble .Ldgemm_kernel_L8_M4_40
KERNEL4x8_I KERNEL4x8_I
KERNEL4x8_E KERNEL4x8_E
b dgemm_kernel_L8_M4_44 b .Ldgemm_kernel_L8_M4_44
dgemm_kernel_L8_M4_40: .Ldgemm_kernel_L8_M4_40:
INIT4x8 INIT4x8
dgemm_kernel_L8_M4_44: .Ldgemm_kernel_L8_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100 ble .Ldgemm_kernel_L8_M4_100
dgemm_kernel_L8_M4_46: .Ldgemm_kernel_L8_M4_46:
KERNEL4x8_SUB KERNEL4x8_SUB
dgemm_kernel_L8_M4_100: .Ldgemm_kernel_L8_M4_100:
SAVE4x8 SAVE4x8
dgemm_kernel_L8_M4_END: .Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20 bne .Ldgemm_kernel_L8_M4_20
dgemm_kernel_L8_M2_BEGIN: .Ldgemm_kernel_L8_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L8_END ble .Ldgemm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L8_M1_BEGIN ble .Ldgemm_kernel_L8_M1_BEGIN
dgemm_kernel_L8_M2_20: .Ldgemm_kernel_L8_M2_20:
INIT2x8 INIT2x8
@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M2_40 ble .Ldgemm_kernel_L8_M2_40
dgemm_kernel_L8_M2_22: .Ldgemm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22 bgt .Ldgemm_kernel_L8_M2_22
dgemm_kernel_L8_M2_40: .Ldgemm_kernel_L8_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M2_100 ble .Ldgemm_kernel_L8_M2_100
dgemm_kernel_L8_M2_42: .Ldgemm_kernel_L8_M2_42:
KERNEL2x8_SUB KERNEL2x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42 bgt .Ldgemm_kernel_L8_M2_42
dgemm_kernel_L8_M2_100: .Ldgemm_kernel_L8_M2_100:
SAVE2x8 SAVE2x8
dgemm_kernel_L8_M2_END: .Ldgemm_kernel_L8_M2_END:
dgemm_kernel_L8_M1_BEGIN: .Ldgemm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L8_END ble .Ldgemm_kernel_L8_END
dgemm_kernel_L8_M1_20: .Ldgemm_kernel_L8_M1_20:
INIT1x8 INIT1x8
@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M1_40 ble .Ldgemm_kernel_L8_M1_40
dgemm_kernel_L8_M1_22: .Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22 bgt .Ldgemm_kernel_L8_M1_22
dgemm_kernel_L8_M1_40: .Ldgemm_kernel_L8_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M1_100 ble .Ldgemm_kernel_L8_M1_100
dgemm_kernel_L8_M1_42: .Ldgemm_kernel_L8_M1_42:
KERNEL1x8_SUB KERNEL1x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42 bgt .Ldgemm_kernel_L8_M1_42
dgemm_kernel_L8_M1_100: .Ldgemm_kernel_L8_M1_100:
SAVE1x8 SAVE1x8
dgemm_kernel_L8_END: .Ldgemm_kernel_L8_END:
lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L8_BEGIN bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L4_BEGIN: .Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dgemm_kernel_L999 ble .Ldgemm_kernel_L999
tst counterJ , #4 tst counterJ , #4
ble dgemm_kernel_L2_BEGIN ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M4_BEGIN: .Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M2_BEGIN ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20: .Ldgemm_kernel_L4_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M4_32 blt .Ldgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a ble .Ldgemm_kernel_L4_M4_22a
.align 5 .align 5
dgemm_kernel_L4_M4_22: .Ldgemm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22 bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_22a: .Ldgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b dgemm_kernel_L4_M4_44 b .Ldgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_32: .Ldgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M4_40 ble .Ldgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b dgemm_kernel_L4_M4_44 b .Ldgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_40: .Ldgemm_kernel_L4_M4_40:
INIT4x4 INIT4x4
dgemm_kernel_L4_M4_44: .Ldgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L4_M4_100 ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_46: .Ldgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
dgemm_kernel_L4_M4_100: .Ldgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
dgemm_kernel_L4_M4_END: .Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20 bne .Ldgemm_kernel_L4_M4_20
dgemm_kernel_L4_M2_BEGIN: .Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20: .Ldgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40 ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22: .Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22 bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40: .Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100 ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42: .Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42 bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100: .Ldgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
dgemm_kernel_L4_M2_END: .Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN: .Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20: .Ldgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40 ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22: .Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22 bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40: .Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100 ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42: .Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42 bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100: .Ldgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
dgemm_kernel_L4_END: .Ldgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4? ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L2_M4_BEGIN: .Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20: .Ldgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40 ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5
dgemm_kernel_L2_M4_22: .Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22 bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40: .Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100 ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42: .Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42 bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100: .Ldgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
dgemm_kernel_L2_M4_END: .Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20 bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN: .Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20: .Ldgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40 ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22: .Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22 bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40: .Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100 ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42: .Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42 bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100: .Ldgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
dgemm_kernel_L2_M2_END: .Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN: .Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20: .Ldgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40 ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22: .Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22 bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40: .Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100 ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42: .Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42 bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100: .Ldgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
dgemm_kernel_L2_END: .Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L1_BEGIN: .Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L1_M4_BEGIN: .Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20: .Ldgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40 ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5
dgemm_kernel_L1_M4_22: .Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22 bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40: .Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100 ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42: .Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42 bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100: .Ldgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
dgemm_kernel_L1_M4_END: .Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20 bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN: .Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20: .Ldgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40 ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22: .Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22 bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40: .Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100 ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42: .Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42 bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100: .Ldgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
dgemm_kernel_L1_M2_END: .Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN: .Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20: .Ldgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40 ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22: .Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22 bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40: .Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100 ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42: .Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42 bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100: .Ldgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dgemm_kernel_L1_END: .Ldgemm_kernel_L1_END:
dgemm_kernel_L999: .Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
.align 5 .align 5
dgemm_kernel_L4_BEGIN: .Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M8_BEGIN: .Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L4_M8_20: .Ldgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32 blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a ble .Ldgemm_kernel_L4_M8_22a
.align 5 .align 5
dgemm_kernel_L4_M8_22: .Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22 bgt .Ldgemm_kernel_L4_M8_22
.align 5 .align 5
dgemm_kernel_L4_M8_22a: .Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
.align 5 .align 5
dgemm_kernel_L4_M8_32: .Ldgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40 ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40: .Ldgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
dgemm_kernel_L4_M8_44: .Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100 ble .Ldgemm_kernel_L4_M8_100
.align 5 .align 5
dgemm_kernel_L4_M8_46: .Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46 bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100: .Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
SAVE8x4 SAVE8x4
dgemm_kernel_L4_M8_END: .Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20 bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN: .Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20: .Ldgemm_kernel_L4_M4_20:
INIT4x4 INIT4x4
@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40 ble .Ldgemm_kernel_L4_M4_40
.align 5 .align 5
dgemm_kernel_L4_M4_22: .Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22 bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40: .Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100 ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42: .Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42 bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100: .Ldgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
dgemm_kernel_L4_M4_END: .Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN: .Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20: .Ldgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40 ble .Ldgemm_kernel_L4_M2_40
.align 5 .align 5
dgemm_kernel_L4_M2_22: .Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22 bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40: .Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100 ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42: .Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42 bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100: .Ldgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
dgemm_kernel_L4_M2_END: .Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN: .Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20: .Ldgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40 ble .Ldgemm_kernel_L4_M1_40
.align 5 .align 5
dgemm_kernel_L4_M1_22: .Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22 bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40: .Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100 ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42: .Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42 bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100: .Ldgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
dgemm_kernel_L4_END: .Ldgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4? ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN: .Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L2_M8_20: .Ldgemm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40 ble .Ldgemm_kernel_L2_M8_40
.align 5 .align 5
dgemm_kernel_L2_M8_22: .Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22 bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40: .Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100 ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42: .Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42 bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100: .Ldgemm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
dgemm_kernel_L2_M8_END: .Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20 bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN: .Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20: .Ldgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40 ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5
dgemm_kernel_L2_M4_22: .Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22 bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40: .Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100 ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42: .Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42 bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100: .Ldgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
dgemm_kernel_L2_M4_END: .Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN: .Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20: .Ldgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40 ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22: .Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22 bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40: .Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100 ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42: .Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42 bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100: .Ldgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
dgemm_kernel_L2_M2_END: .Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN: .Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20: .Ldgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40 ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22: .Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22 bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40: .Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100 ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42: .Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42 bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100: .Ldgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
dgemm_kernel_L2_END: .Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L1_BEGIN: .Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN: .Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L1_M8_20: .Ldgemm_kernel_L1_M8_20:
INIT8x1 INIT8x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40 ble .Ldgemm_kernel_L1_M8_40
.align 5 .align 5
dgemm_kernel_L1_M8_22: .Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22 bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40: .Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100 ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42: .Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42 bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100: .Ldgemm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
dgemm_kernel_L1_M8_END: .Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20 bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN: .Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20: .Ldgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40 ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5
dgemm_kernel_L1_M4_22: .Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22 bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40: .Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100 ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42: .Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42 bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100: .Ldgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
dgemm_kernel_L1_M4_END: .Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN: .Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20: .Ldgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40 ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22: .Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22 bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40: .Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100 ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42: .Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42 bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100: .Ldgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
dgemm_kernel_L1_M2_END: .Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN: .Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20: .Ldgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40 ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22: .Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22 bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40: .Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100 ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42: .Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42 bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100: .Ldgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dgemm_kernel_L1_END: .Ldgemm_kernel_L1_END:
dgemm_kernel_L999: .Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
.align 5 .align 5
dgemm_kernel_L4_BEGIN: .Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M8_BEGIN: .Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L4_M8_20: .Ldgemm_kernel_L4_M8_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #7 // L = K / 128 asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32 blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x1
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a ble .Ldgemm_kernel_L4_M8_22a
.align 5 .align 5
dgemm_kernel_L4_M8_22: .Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x64 KERNEL8x4_M1_M2_x64
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22 bgt .Ldgemm_kernel_L4_M8_22
.align 5 .align 5
dgemm_kernel_L4_M8_22a: .Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16
@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
.align 5 .align 5
dgemm_kernel_L4_M8_32: .Ldgemm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40 ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dgemm_kernel_L4_M8_44 b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40: .Ldgemm_kernel_L4_M8_40:
INIT8x4 INIT8x4
dgemm_kernel_L4_M8_44: .Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #127 ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100 ble .Ldgemm_kernel_L4_M8_100
.align 5 .align 5
dgemm_kernel_L4_M8_46: .Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46 bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100: .Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
dgemm_kernel_L4_M8_END: .Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20 bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN: .Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20: .Ldgemm_kernel_L4_M4_20:
INIT4x4 INIT4x4
@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40 ble .Ldgemm_kernel_L4_M4_40
.align 5 .align 5
dgemm_kernel_L4_M4_22: .Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22 bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40: .Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100 ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42: .Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42 bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100: .Ldgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
dgemm_kernel_L4_M4_END: .Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN: .Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20: .Ldgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40 ble .Ldgemm_kernel_L4_M2_40
.align 5 .align 5
dgemm_kernel_L4_M2_22: .Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22 bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40: .Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100 ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42: .Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42 bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100: .Ldgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
dgemm_kernel_L4_M2_END: .Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN: .Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20: .Ldgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40 ble .Ldgemm_kernel_L4_M1_40
.align 5 .align 5
dgemm_kernel_L4_M1_22: .Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22 bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40: .Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100 ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42: .Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42 bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100: .Ldgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
dgemm_kernel_L4_END: .Ldgemm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4? ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN: .Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L2_M8_20: .Ldgemm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40 ble .Ldgemm_kernel_L2_M8_40
.align 5 .align 5
dgemm_kernel_L2_M8_22: .Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22 bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40: .Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100 ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42: .Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42 bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100: .Ldgemm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
dgemm_kernel_L2_M8_END: .Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20 bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN: .Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20: .Ldgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40 ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5
dgemm_kernel_L2_M4_22: .Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22 bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40: .Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100 ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42: .Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42 bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100: .Ldgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
dgemm_kernel_L2_M4_END: .Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN: .Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20: .Ldgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40 ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22: .Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22 bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40: .Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100 ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42: .Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42 bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100: .Ldgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
dgemm_kernel_L2_M2_END: .Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN: .Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20: .Ldgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40 ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22: .Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22 bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40: .Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100 ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42: .Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42 bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100: .Ldgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
dgemm_kernel_L2_END: .Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/ /******************************************************************************/
dgemm_kernel_L1_BEGIN: .Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN: .Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5 .align 5
dgemm_kernel_L1_M8_20: .Ldgemm_kernel_L1_M8_20:
INIT8x1 INIT8x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40 ble .Ldgemm_kernel_L1_M8_40
.align 5 .align 5
dgemm_kernel_L1_M8_22: .Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22 bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40: .Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = origK % 8 ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M8_100 ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42: .Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42 bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100: .Ldgemm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
dgemm_kernel_L1_M8_END: .Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20 bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN: .Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #4 // test bit 2 of counterI (origM & 4) tst counterI, #4 // test bit 2 of counterI (origM & 4)
ble dgemm_kernel_L1_M2_BEGIN ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20: .Ldgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = origK / 8 asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40 ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5
dgemm_kernel_L1_M4_22: .Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22 bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40: .Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = origK % 8 ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M4_100 ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42: .Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42 bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100: .Ldgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
dgemm_kernel_L1_M4_END: .Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN: .Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (origM & 2) tst counterI, #2 // test bit 1 of counterI (origM & 2)
ble dgemm_kernel_L1_M1_BEGIN ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20: .Ldgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = origK / 8 asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40 ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22: .Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22 bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40: .Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = origK % 8 ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M2_100 ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42: .Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42 bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100: .Ldgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
dgemm_kernel_L1_M2_END: .Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN: .Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (origM & 1) tst counterI, #1 // test bit 0 of counterI (origM & 1)
ble dgemm_kernel_L1_END ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20: .Ldgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = origK / 8 asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40 ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22: .Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22 bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40: .Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = origK % 8 ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M1_100 ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42: .Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42 bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100: .Ldgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dgemm_kernel_L1_END: .Ldgemm_kernel_L1_END:
dgemm_kernel_L999: .Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L4_BEGIN: .Ldgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4 asr J, N, #2 // J = N / 4
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L2_BEGIN ble .Ldgemm_ncopy_L2_BEGIN
.align 5 .align 5
dgemm_ncopy_L4_M4_BEGIN: .Ldgemm_ncopy_L4_M4_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:
asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_40 ble .Ldgemm_ncopy_L4_M4_40
.align 5 .align 5
dgemm_ncopy_L4_M4_20: .Ldgemm_ncopy_L4_M4_20:
COPY4x4 COPY4x4
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_20 bne .Ldgemm_ncopy_L4_M4_20
dgemm_ncopy_L4_M4_40: .Ldgemm_ncopy_L4_M4_40:
and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_END ble .Ldgemm_ncopy_L4_M4_END
.align 5 .align 5
dgemm_ncopy_L4_M4_60: .Ldgemm_ncopy_L4_M4_60:
COPY1x4 COPY1x4
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_60 bne .Ldgemm_ncopy_L4_M4_60
dgemm_ncopy_L4_M4_END: .Ldgemm_ncopy_L4_M4_END:
subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN bne .Ldgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_ncopy_L2_BEGIN: .Ldgemm_ncopy_L2_BEGIN:
tst N, #3 tst N, #3
ble dgemm_ncopy_L999 ble .Ldgemm_ncopy_L999
tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M4_BEGIN: .Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA
asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_40 ble .Ldgemm_ncopy_L2_M4_40
.align 5 .align 5
dgemm_ncopy_L2_M4_20: .Ldgemm_ncopy_L2_M4_20:
COPY4x2 COPY4x2
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_20 bne .Ldgemm_ncopy_L2_M4_20
dgemm_ncopy_L2_M4_40: .Ldgemm_ncopy_L2_M4_40:
and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_END ble .Ldgemm_ncopy_L2_M4_END
.align 5 .align 5
dgemm_ncopy_L2_M4_60: .Ldgemm_ncopy_L2_M4_60:
COPY1x2 COPY1x2
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_60 bne .Ldgemm_ncopy_L2_M4_60
dgemm_ncopy_L2_M4_END: .Ldgemm_ncopy_L2_M4_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_ncopy_L1_BEGIN: .Ldgemm_ncopy_L1_BEGIN:
tst N, #1 tst N, #1
ble dgemm_ncopy_L999 ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M4_BEGIN: .Ldgemm_ncopy_L1_M4_BEGIN:
mov A01, A00 mov A01, A00
asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_40 ble .Ldgemm_ncopy_L1_M4_40
.align 5 .align 5
dgemm_ncopy_L1_M4_20: .Ldgemm_ncopy_L1_M4_20:
COPY4x1 COPY4x1
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_20 bne .Ldgemm_ncopy_L1_M4_20
dgemm_ncopy_L1_M4_40: .Ldgemm_ncopy_L1_M4_40:
and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_END ble .Ldgemm_ncopy_L1_M4_END
.align 5 .align 5
dgemm_ncopy_L1_M4_60: .Ldgemm_ncopy_L1_M4_60:
COPY1x1 COPY1x1
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_60 bne .Ldgemm_ncopy_L1_M4_60
dgemm_ncopy_L1_M4_END: .Ldgemm_ncopy_L1_M4_END:
dgemm_ncopy_L999: .Ldgemm_ncopy_L999:
mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS

View File

@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE
dgemm_ncopy_L8_BEGIN: .Ldgemm_ncopy_L8_BEGIN:
asr J, N, #3 // J = N / 8 asr J, N, #3 // J = N / 8
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L4_BEGIN ble .Ldgemm_ncopy_L4_BEGIN
dgemm_ncopy_L8_M8_BEGIN: .Ldgemm_ncopy_L8_M8_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:
asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_40 ble .Ldgemm_ncopy_L8_M8_40
dgemm_ncopy_L8_M8_20: .Ldgemm_ncopy_L8_M8_20:
COPY8x8 COPY8x8
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_20 bne .Ldgemm_ncopy_L8_M8_20
dgemm_ncopy_L8_M8_40: .Ldgemm_ncopy_L8_M8_40:
and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_END ble .Ldgemm_ncopy_L8_M8_END
dgemm_ncopy_L8_M8_60: .Ldgemm_ncopy_L8_M8_60:
COPY1x8 COPY1x8
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_60 bne .Ldgemm_ncopy_L8_M8_60
dgemm_ncopy_L8_M8_END: .Ldgemm_ncopy_L8_M8_END:
subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN bne .Ldgemm_ncopy_L8_M8_BEGIN
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_ncopy_L4_BEGIN: .Ldgemm_ncopy_L4_BEGIN:
tst N, #7 tst N, #7
ble dgemm_ncopy_L999 ble .Ldgemm_ncopy_L999
tst N, #4 tst N, #4
ble dgemm_ncopy_L2_BEGIN ble .Ldgemm_ncopy_L2_BEGIN
dgemm_ncopy_L4_M8_BEGIN: .Ldgemm_ncopy_L4_M8_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:
asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_40 ble .Ldgemm_ncopy_L4_M8_40
dgemm_ncopy_L4_M8_20: .Ldgemm_ncopy_L4_M8_20:
COPY8x4 COPY8x4
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_20 bne .Ldgemm_ncopy_L4_M8_20
dgemm_ncopy_L4_M8_40: .Ldgemm_ncopy_L4_M8_40:
and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_END ble .Ldgemm_ncopy_L4_M8_END
dgemm_ncopy_L4_M8_60: .Ldgemm_ncopy_L4_M8_60:
COPY1x4 COPY1x4
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_60 bne .Ldgemm_ncopy_L4_M8_60
dgemm_ncopy_L4_M8_END: .Ldgemm_ncopy_L4_M8_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_ncopy_L2_BEGIN: .Ldgemm_ncopy_L2_BEGIN:
tst N, #3 tst N, #3
ble dgemm_ncopy_L999 ble .Ldgemm_ncopy_L999
tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN ble .Ldgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M8_BEGIN: .Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA
asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_40 ble .Ldgemm_ncopy_L2_M8_40
dgemm_ncopy_L2_M8_20: .Ldgemm_ncopy_L2_M8_20:
COPY8x2 COPY8x2
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_20 bne .Ldgemm_ncopy_L2_M8_20
dgemm_ncopy_L2_M8_40: .Ldgemm_ncopy_L2_M8_40:
and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_END ble .Ldgemm_ncopy_L2_M8_END
dgemm_ncopy_L2_M8_60: .Ldgemm_ncopy_L2_M8_60:
COPY1x2 COPY1x2
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_60 bne .Ldgemm_ncopy_L2_M8_60
dgemm_ncopy_L2_M8_END: .Ldgemm_ncopy_L2_M8_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_ncopy_L1_BEGIN: .Ldgemm_ncopy_L1_BEGIN:
tst N, #1 tst N, #1
ble dgemm_ncopy_L999 ble .Ldgemm_ncopy_L999
dgemm_ncopy_L1_M8_BEGIN: .Ldgemm_ncopy_L1_M8_BEGIN:
mov A01, A00 mov A01, A00
asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_40 ble .Ldgemm_ncopy_L1_M8_40
dgemm_ncopy_L1_M8_20: .Ldgemm_ncopy_L1_M8_20:
COPY8x1 COPY8x1
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_20 bne .Ldgemm_ncopy_L1_M8_20
dgemm_ncopy_L1_M8_40: .Ldgemm_ncopy_L1_M8_40:
and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_END ble .Ldgemm_ncopy_L1_M8_END
dgemm_ncopy_L1_M8_60: .Ldgemm_ncopy_L1_M8_60:
COPY1x1 COPY1x1
subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_60 bne .Ldgemm_ncopy_L1_M8_60
dgemm_ncopy_L1_M8_END: .Ldgemm_ncopy_L1_M8_END:
dgemm_ncopy_L999: .Ldgemm_ncopy_L999:
mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS

View File

@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M4, M, #5 // M4 = M * 4 * SIZE lsl M4, M, #5 // M4 = M * 4 * SIZE
dgemm_tcopy_L4_BEGIN: .Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4 asr J, M, #2 // J = M / 4
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L2_BEGIN ble .Ldgemm_tcopy_L2_BEGIN
.align 5 .align 5
dgemm_tcopy_L4_M4_BEGIN: .Ldgemm_tcopy_L4_M4_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:
asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M4_40 ble .Ldgemm_tcopy_L4_M4_40
.align 5 .align 5
dgemm_tcopy_L4_M4_20: .Ldgemm_tcopy_L4_M4_20:
COPY4x4 COPY4x4
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M4_20 bne .Ldgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40: .Ldgemm_tcopy_L4_M4_40:
tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M4_60 ble .Ldgemm_tcopy_L4_M4_60
COPY2x4 COPY2x4
dgemm_tcopy_L4_M4_60: .Ldgemm_tcopy_L4_M4_60:
tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M4_END ble .Ldgemm_tcopy_L4_M4_END
COPY1x4 COPY1x4
dgemm_tcopy_L4_M4_END: .Ldgemm_tcopy_L4_M4_END:
subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN bne .Ldgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_tcopy_L2_BEGIN: .Ldgemm_tcopy_L2_BEGIN:
tst M, #3 tst M, #3
ble dgemm_tcopy_L999 ble .Ldgemm_tcopy_L999
tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M4_BEGIN: .Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:
asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M4_40 ble .Ldgemm_tcopy_L2_M4_40
.align 5 .align 5
dgemm_tcopy_L2_M4_20: .Ldgemm_tcopy_L2_M4_20:
COPY4x2 COPY4x2
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M4_20 bne .Ldgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40: .Ldgemm_tcopy_L2_M4_40:
tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M4_60 ble .Ldgemm_tcopy_L2_M4_60
COPY2x2 COPY2x2
dgemm_tcopy_L2_M4_60: .Ldgemm_tcopy_L2_M4_60:
tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M4_END ble .Ldgemm_tcopy_L2_M4_END
COPY1x2 COPY1x2
dgemm_tcopy_L2_M4_END: .Ldgemm_tcopy_L2_M4_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_tcopy_L1_BEGIN: .Ldgemm_tcopy_L1_BEGIN:
tst M, #1 tst M, #1
ble dgemm_tcopy_L999 ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M4_BEGIN: .Ldgemm_tcopy_L1_M4_BEGIN:
mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B
asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M4_40 ble .Ldgemm_tcopy_L1_M4_40
.align 5 .align 5
dgemm_tcopy_L1_M4_20: .Ldgemm_tcopy_L1_M4_20:
COPY4x1 COPY4x1
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M4_20 bne .Ldgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40: .Ldgemm_tcopy_L1_M4_40:
tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M4_60 ble .Ldgemm_tcopy_L1_M4_60
COPY2x1 COPY2x1
dgemm_tcopy_L1_M4_60: .Ldgemm_tcopy_L1_M4_60:
tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M4_END ble .Ldgemm_tcopy_L1_M4_END
COPY1x1 COPY1x1
dgemm_tcopy_L1_M4_END: .Ldgemm_tcopy_L1_M4_END:
dgemm_tcopy_L999: .Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret

View File

@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lsl M8, M, #6 // M8 = M * 8 * SIZE lsl M8, M, #6 // M8 = M * 8 * SIZE
dgemm_tcopy_L8_BEGIN: .Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8 asr J, M, #3 // J = M / 8
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L4_BEGIN ble .Ldgemm_tcopy_L4_BEGIN
.align 5 .align 5
dgemm_tcopy_L8_M8_BEGIN: .Ldgemm_tcopy_L8_M8_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L8_M8_40 ble .Ldgemm_tcopy_L8_M8_40
.align 5 .align 5
dgemm_tcopy_L8_M8_20: .Ldgemm_tcopy_L8_M8_20:
COPY8x8 COPY8x8
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L8_M8_20 bne .Ldgemm_tcopy_L8_M8_20
dgemm_tcopy_L8_M8_40: .Ldgemm_tcopy_L8_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L8_M8_60 ble .Ldgemm_tcopy_L8_M8_60
COPY4x8 COPY4x8
dgemm_tcopy_L8_M8_60: .Ldgemm_tcopy_L8_M8_60:
tst N , #2 tst N , #2
ble dgemm_tcopy_L8_M8_80 ble .Ldgemm_tcopy_L8_M8_80
COPY2x8 COPY2x8
dgemm_tcopy_L8_M8_80: .Ldgemm_tcopy_L8_M8_80:
tst N, #1 tst N, #1
ble dgemm_tcopy_L8_M8_END ble .Ldgemm_tcopy_L8_M8_END
COPY1x8 COPY1x8
dgemm_tcopy_L8_M8_END: .Ldgemm_tcopy_L8_M8_END:
subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN bne .Ldgemm_tcopy_L8_M8_BEGIN
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_tcopy_L4_BEGIN: .Ldgemm_tcopy_L4_BEGIN:
tst M, #7 tst M, #7
ble dgemm_tcopy_L999 ble .Ldgemm_tcopy_L999
tst M, #4 tst M, #4
ble dgemm_tcopy_L2_BEGIN ble .Ldgemm_tcopy_L2_BEGIN
dgemm_tcopy_L4_M8_BEGIN: .Ldgemm_tcopy_L4_M8_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M8_40 ble .Ldgemm_tcopy_L4_M8_40
.align 5 .align 5
dgemm_tcopy_L4_M8_20: .Ldgemm_tcopy_L4_M8_20:
COPY8x4 COPY8x4
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M8_20 bne .Ldgemm_tcopy_L4_M8_20
dgemm_tcopy_L4_M8_40: .Ldgemm_tcopy_L4_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L4_M8_60 ble .Ldgemm_tcopy_L4_M8_60
COPY4x4 COPY4x4
dgemm_tcopy_L4_M8_60: .Ldgemm_tcopy_L4_M8_60:
tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M8_80 ble .Ldgemm_tcopy_L4_M8_80
COPY2x4 COPY2x4
dgemm_tcopy_L4_M8_80: .Ldgemm_tcopy_L4_M8_80:
tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M8_END ble .Ldgemm_tcopy_L4_M8_END
COPY1x4 COPY1x4
dgemm_tcopy_L4_M8_END: .Ldgemm_tcopy_L4_M8_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_tcopy_L2_BEGIN: .Ldgemm_tcopy_L2_BEGIN:
tst M, #3 tst M, #3
ble dgemm_tcopy_L999 ble .Ldgemm_tcopy_L999
tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN ble .Ldgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M8_BEGIN: .Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M8_40 ble .Ldgemm_tcopy_L2_M8_40
.align 5 .align 5
dgemm_tcopy_L2_M8_20: .Ldgemm_tcopy_L2_M8_20:
COPY8x2 COPY8x2
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M8_20 bne .Ldgemm_tcopy_L2_M8_20
dgemm_tcopy_L2_M8_40: .Ldgemm_tcopy_L2_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L2_M8_60 ble .Ldgemm_tcopy_L2_M8_60
COPY4x2 COPY4x2
dgemm_tcopy_L2_M8_60: .Ldgemm_tcopy_L2_M8_60:
tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M8_80 ble .Ldgemm_tcopy_L2_M8_80
COPY2x2 COPY2x2
dgemm_tcopy_L2_M8_80: .Ldgemm_tcopy_L2_M8_80:
tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M8_END ble .Ldgemm_tcopy_L2_M8_END
COPY1x2 COPY1x2
dgemm_tcopy_L2_M8_END: .Ldgemm_tcopy_L2_M8_END:
/*********************************************************************************************/ /*********************************************************************************************/
dgemm_tcopy_L1_BEGIN: .Ldgemm_tcopy_L1_BEGIN:
tst M, #1 tst M, #1
ble dgemm_tcopy_L999 ble .Ldgemm_tcopy_L999
dgemm_tcopy_L1_M8_BEGIN: .Ldgemm_tcopy_L1_M8_BEGIN:
mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M8_40 ble .Ldgemm_tcopy_L1_M8_40
.align 5 .align 5
dgemm_tcopy_L1_M8_20: .Ldgemm_tcopy_L1_M8_20:
COPY8x1 COPY8x1
subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M8_20 bne .Ldgemm_tcopy_L1_M8_20
dgemm_tcopy_L1_M8_40: .Ldgemm_tcopy_L1_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L1_M8_60 ble .Ldgemm_tcopy_L1_M8_60
COPY4x1 COPY4x1
dgemm_tcopy_L1_M8_60: .Ldgemm_tcopy_L1_M8_60:
tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M8_80 ble .Ldgemm_tcopy_L1_M8_80
COPY2x1 COPY2x1
dgemm_tcopy_L1_M8_80: .Ldgemm_tcopy_L1_M8_80:
tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M8_END ble .Ldgemm_tcopy_L1_M8_END
COPY1x1 COPY1x1
dgemm_tcopy_L1_M8_END: .Ldgemm_tcopy_L1_M8_END:
dgemm_tcopy_L999: .Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret

View File

@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
cmp N, xzr cmp N, xzr
ble dot_kernel_L999 ble .Ldot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN bne .Ldot_kernel_S_BEGIN
dot_kernel_F_BEGIN: .Ldot_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1 beq .Ldot_kernel_F1
dot_kernel_F4: .Ldot_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne dot_kernel_F4 bne .Ldot_kernel_F4
KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE
dot_kernel_F1: .Ldot_kernel_F1:
ands I, N, #3 ands I, N, #3
ble dot_kernel_L999 ble .Ldot_kernel_L999
dot_kernel_F10: .Ldot_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne dot_kernel_F10 bne .Ldot_kernel_F10
ret ret
dot_kernel_S_BEGIN: .Ldot_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1 ble .Ldot_kernel_S1
dot_kernel_S4: .Ldot_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne dot_kernel_S4 bne .Ldot_kernel_S4
dot_kernel_S1: .Ldot_kernel_S1:
ands I, N, #3 ands I, N, #3
ble dot_kernel_L999 ble .Ldot_kernel_L999
dot_kernel_S10: .Ldot_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne dot_kernel_S10 bne .Ldot_kernel_S10
dot_kernel_L999: .Ldot_kernel_L999:
ret ret

View File

@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L4_BEGIN: .Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M4_BEGIN: .Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20: .Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32 blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5
dtrmm_kernel_L4_M4_22: .Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22 bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a: .Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b dtrmm_kernel_L4_M4_44 b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32: .Ldtrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40 ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b dtrmm_kernel_L4_M4_44 b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40: .Ldtrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
dtrmm_kernel_L4_M4_44: .Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100 ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46: .Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100: .Ldtrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L4_M4_END: .Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20 bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN: .Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (origM & 2) tst counterI, #2 // test bit 1 of counterI (origM & 2)
ble dtrmm_kernel_L4_M1_BEGIN ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20: .Ldtrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = tempK / 8 asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40 ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22: .Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22 bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40: .Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = tempK % 8 ands counterL , tempK, #7 // counterL = tempK % 8
ble dtrmm_kernel_L4_M2_100 ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42: .Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42 bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100: .Ldtrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L4_M2_END: .Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN: .Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // test bit 0 of counterI (origM & 1) tst counterI, #1 // test bit 0 of counterI (origM & 1)
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20: .Ldtrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = tempK / 8 asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40 ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22: .Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22 bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40: .Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = tempK % 8 ands counterL , tempK, #7 // counterL = tempK % 8
ble dtrmm_kernel_L4_M1_100 ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42: .Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42 bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100: .Ldtrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L4_END: .Ldtrmm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction .Ldtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // done: N is a multiple of 4, nothing left ble .Ldtrmm_kernel_L999 // done: N is a multiple of 4, nothing left
tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L2_M4_BEGIN: .Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20: .Ldtrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = tempK / 8 asr counterL , tempK, #3 // counterL = tempK / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40 ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5
dtrmm_kernel_L2_M4_22: .Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22 bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40: .Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100 ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42: .Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42 bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100: .Ldtrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L2_M4_END: .Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20 bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN: .Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L2_M1_BEGIN ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20: .Ldtrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40 ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22: .Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22 bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40: .Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100 ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42: .Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42 bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100: .Ldtrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L2_M2_END: .Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN: .Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20: .Ldtrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40 ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22: .Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22 bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40: .Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100 ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42: .Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42 bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100: .Ldtrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L2_END: .Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L1_BEGIN: .Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L1_M4_BEGIN: .Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20: .Ldtrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40 ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5
dtrmm_kernel_L1_M4_22: .Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22 bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40: .Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100 ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42: .Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42 bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100: .Ldtrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L1_M4_END: .Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20 bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN: .Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L1_M1_BEGIN ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20: .Ldtrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40 ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22: .Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22 bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40: .Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100 ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42: .Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42 bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100: .Ldtrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L1_M2_END: .Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN: .Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20: .Ldtrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40 ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22: .Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22 bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40: .Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100 ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42: .Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42 bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100: .Ldtrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dtrmm_kernel_L1_END: .Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999: .Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN ble .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L8_BEGIN: .Ldtrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3
@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dtrmm_kernel_L8_M4_BEGIN: .Ldtrmm_kernel_L8_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L8_M2_BEGIN ble .Ldtrmm_kernel_L8_M2_BEGIN
dtrmm_kernel_L8_M4_20: .Ldtrmm_kernel_L8_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:
asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32 blt .Ldtrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a ble .Ldtrmm_kernel_L8_M4_22a
.align 5 .align 5
dtrmm_kernel_L8_M4_22: .Ldtrmm_kernel_L8_M4_22:
KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M4_22 bgt .Ldtrmm_kernel_L8_M4_22
dtrmm_kernel_L8_M4_22a: .Ldtrmm_kernel_L8_M4_22a:
KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E
b dtrmm_kernel_L8_M4_44 b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_32: .Ldtrmm_kernel_L8_M4_32:
tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L8_M4_40 ble .Ldtrmm_kernel_L8_M4_40
KERNEL4x8_I KERNEL4x8_I
KERNEL4x8_E KERNEL4x8_E
b dtrmm_kernel_L8_M4_44 b .Ldtrmm_kernel_L8_M4_44
dtrmm_kernel_L8_M4_40: .Ldtrmm_kernel_L8_M4_40:
INIT4x8 INIT4x8
dtrmm_kernel_L8_M4_44: .Ldtrmm_kernel_L8_M4_44:
ands counterL, tempK, #1 ands counterL, tempK, #1
ble dtrmm_kernel_L8_M4_100 ble .Ldtrmm_kernel_L8_M4_100
dtrmm_kernel_L8_M4_46: .Ldtrmm_kernel_L8_M4_46:
KERNEL4x8_SUB KERNEL4x8_SUB
dtrmm_kernel_L8_M4_100: .Ldtrmm_kernel_L8_M4_100:
SAVE4x8 SAVE4x8
@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L8_M4_END: .Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L8_M4_20 bne .Ldtrmm_kernel_L8_M4_20
dtrmm_kernel_L8_M2_BEGIN: .Ldtrmm_kernel_L8_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L8_END ble .Ldtrmm_kernel_L8_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L8_M1_BEGIN ble .Ldtrmm_kernel_L8_M1_BEGIN
dtrmm_kernel_L8_M2_20: .Ldtrmm_kernel_L8_M2_20:
INIT2x8 INIT2x8
@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:
asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M2_40 ble .Ldtrmm_kernel_L8_M2_40
dtrmm_kernel_L8_M2_22: .Ldtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22 bgt .Ldtrmm_kernel_L8_M2_22
dtrmm_kernel_L8_M2_40: .Ldtrmm_kernel_L8_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M2_100 ble .Ldtrmm_kernel_L8_M2_100
dtrmm_kernel_L8_M2_42: .Ldtrmm_kernel_L8_M2_42:
KERNEL2x8_SUB KERNEL2x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_42 bgt .Ldtrmm_kernel_L8_M2_42
dtrmm_kernel_L8_M2_100: .Ldtrmm_kernel_L8_M2_100:
SAVE2x8 SAVE2x8
@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L8_M2_END: .Ldtrmm_kernel_L8_M2_END:
dtrmm_kernel_L8_M1_BEGIN: .Ldtrmm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L8_END ble .Ldtrmm_kernel_L8_END
dtrmm_kernel_L8_M1_20: .Ldtrmm_kernel_L8_M1_20:
INIT1x8 INIT1x8
@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:
asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M1_40 ble .Ldtrmm_kernel_L8_M1_40
dtrmm_kernel_L8_M1_22: .Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22 bgt .Ldtrmm_kernel_L8_M1_22
dtrmm_kernel_L8_M1_40: .Ldtrmm_kernel_L8_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M1_100 ble .Ldtrmm_kernel_L8_M1_100
dtrmm_kernel_L8_M1_42: .Ldtrmm_kernel_L8_M1_42:
KERNEL1x8_SUB KERNEL1x8_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_42 bgt .Ldtrmm_kernel_L8_M1_42
dtrmm_kernel_L8_M1_100: .Ldtrmm_kernel_L8_M1_100:
SAVE1x8 SAVE1x8
@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L8_END: .Ldtrmm_kernel_L8_END:
lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8
@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN bgt .Ldtrmm_kernel_L8_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L4_BEGIN: .Ldtrmm_kernel_L4_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dtrmm_kernel_L999 ble .Ldtrmm_kernel_L999
tst counterJ , #4 tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN ble .Ldtrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M4_BEGIN: .Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20: .Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:
asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32 blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5
dtrmm_kernel_L4_M4_22: .Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22 bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_22a: .Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b dtrmm_kernel_L4_M4_44 b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_32: .Ldtrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40 ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b dtrmm_kernel_L4_M4_44 b .Ldtrmm_kernel_L4_M4_44
dtrmm_kernel_L4_M4_40: .Ldtrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
dtrmm_kernel_L4_M4_44: .Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100 ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_46: .Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
dtrmm_kernel_L4_M4_100: .Ldtrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L4_M4_END: .Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20 bne .Ldtrmm_kernel_L4_M4_20
dtrmm_kernel_L4_M2_BEGIN: .Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L4_M1_BEGIN ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20: .Ldtrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40 ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22: .Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22 bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40: .Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100 ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42: .Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42 bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100: .Ldtrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L4_M2_END: .Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN: .Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20: .Ldtrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40 ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22: .Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22 bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40: .Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100 ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42: .Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42 bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100: .Ldtrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L4_END: .Ldtrmm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction .Ldtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // N % 4 == 0: nothing left in N direction, done ble .Ldtrmm_kernel_L999 // N % 4 == 0: nothing left in N direction, done
tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L2_M4_BEGIN: .Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20: .Ldtrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40 ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5
dtrmm_kernel_L2_M4_22: .Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22 bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40: .Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100 ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42: .Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42 bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100: .Ldtrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L2_M4_END: .Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20 bgt .Ldtrmm_kernel_L2_M4_20
dtrmm_kernel_L2_M2_BEGIN: .Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L2_M1_BEGIN ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20: .Ldtrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40 ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22: .Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22 bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40: .Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100 ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42: .Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42 bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100: .Ldtrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L2_M2_END: .Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN: .Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20: .Ldtrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40 ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22: .Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22 bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40: .Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100 ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42: .Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42 bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100: .Ldtrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L2_END: .Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L1_BEGIN: .Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L1_M4_BEGIN: .Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20: .Ldtrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40 ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5
dtrmm_kernel_L1_M4_22: .Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22 bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40: .Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100 ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42: .Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42 bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100: .Ldtrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L1_M4_END: .Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20 bgt .Ldtrmm_kernel_L1_M4_20
dtrmm_kernel_L1_M2_BEGIN: .Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M? tst counterI, #2 // test bit 1 of counterI (flags only): is a block of 2 left in M?
ble dtrmm_kernel_L1_M1_BEGIN ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20: .Ldtrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40 ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22: .Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22 bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40: .Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100 ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42: .Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42 bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100: .Ldtrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L1_M2_END: .Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN: .Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20: .Ldtrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40 ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22: .Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22 bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40: .Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100 ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42: .Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42 bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100: .Ldtrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dtrmm_kernel_L1_END: .Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999: .Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L4_BEGIN: .Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
dtrmm_kernel_L4_M8_BEGIN: .Ldtrmm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN ble .Ldtrmm_kernel_L4_M4_BEGIN
.align 5 .align 5
dtrmm_kernel_L4_M8_20: .Ldtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:
asr counterL , tempK, #3 // L = K / 8 asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32 blt .Ldtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K
@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a ble .Ldtrmm_kernel_L4_M8_22a
.align 5 .align 5
dtrmm_kernel_L4_M8_22: .Ldtrmm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22 bgt .Ldtrmm_kernel_L4_M8_22
.align 5 .align 5
dtrmm_kernel_L4_M8_22a: .Ldtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dtrmm_kernel_L4_M8_44 b .Ldtrmm_kernel_L4_M8_44
.align 5 .align 5
dtrmm_kernel_L4_M8_32: .Ldtrmm_kernel_L4_M8_32:
tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M8_40 ble .Ldtrmm_kernel_L4_M8_40
KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E
b dtrmm_kernel_L4_M8_44 b .Ldtrmm_kernel_L4_M8_44
dtrmm_kernel_L4_M8_40: .Ldtrmm_kernel_L4_M8_40:
INIT8x4 INIT8x4
dtrmm_kernel_L4_M8_44: .Ldtrmm_kernel_L4_M8_44:
ands counterL , tempK, #7 ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100 ble .Ldtrmm_kernel_L4_M8_100
.align 5 .align 5
dtrmm_kernel_L4_M8_46: .Ldtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46 bne .Ldtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100: .Ldtrmm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END: .Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M8_20 bne .Ldtrmm_kernel_L4_M8_20
dtrmm_kernel_L4_M4_BEGIN: .Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble dtrmm_kernel_L4_M2_BEGIN ble .Ldtrmm_kernel_L4_M2_BEGIN
dtrmm_kernel_L4_M4_20: .Ldtrmm_kernel_L4_M4_20:
INIT4x4 INIT4x4
@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M4_40 ble .Ldtrmm_kernel_L4_M4_40
dtrmm_kernel_L4_M4_22: .Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22 bgt .Ldtrmm_kernel_L4_M4_22
dtrmm_kernel_L4_M4_40: .Ldtrmm_kernel_L4_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M4_100 ble .Ldtrmm_kernel_L4_M4_100
dtrmm_kernel_L4_M4_42: .Ldtrmm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_42 bgt .Ldtrmm_kernel_L4_M4_42
dtrmm_kernel_L4_M4_100: .Ldtrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L4_M4_END: .Ldtrmm_kernel_L4_M4_END:
dtrmm_kernel_L4_M2_BEGIN: .Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN ble .Ldtrmm_kernel_L4_M1_BEGIN
dtrmm_kernel_L4_M2_20: .Ldtrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40 ble .Ldtrmm_kernel_L4_M2_40
dtrmm_kernel_L4_M2_22: .Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22 bgt .Ldtrmm_kernel_L4_M2_22
dtrmm_kernel_L4_M2_40: .Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100 ble .Ldtrmm_kernel_L4_M2_100
dtrmm_kernel_L4_M2_42: .Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42 bgt .Ldtrmm_kernel_L4_M2_42
dtrmm_kernel_L4_M2_100: .Ldtrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L4_M2_END: .Ldtrmm_kernel_L4_M2_END:
dtrmm_kernel_L4_M1_BEGIN: .Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END ble .Ldtrmm_kernel_L4_END
dtrmm_kernel_L4_M1_20: .Ldtrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40 ble .Ldtrmm_kernel_L4_M1_40
dtrmm_kernel_L4_M1_22: .Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22 bgt .Ldtrmm_kernel_L4_M1_22
dtrmm_kernel_L4_M1_40: .Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100 ble .Ldtrmm_kernel_L4_M1_100
dtrmm_kernel_L4_M1_42: .Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42 bgt .Ldtrmm_kernel_L4_M1_42
dtrmm_kernel_L4_M1_100: .Ldtrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L4_END: .Ldtrmm_kernel_L4_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction .Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4? ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L2_M8_BEGIN: .Ldtrmm_kernel_L2_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN ble .Ldtrmm_kernel_L2_M4_BEGIN
dtrmm_kernel_L2_M8_20: .Ldtrmm_kernel_L2_M8_20:
INIT8x2 INIT8x2
@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M8_40 ble .Ldtrmm_kernel_L2_M8_40
.align 5 .align 5
dtrmm_kernel_L2_M8_22: .Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22 bgt .Ldtrmm_kernel_L2_M8_22
dtrmm_kernel_L2_M8_40: .Ldtrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100 ble .Ldtrmm_kernel_L2_M8_100
dtrmm_kernel_L2_M8_42: .Ldtrmm_kernel_L2_M8_42:
KERNEL8x2_SUB KERNEL8x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42 bgt .Ldtrmm_kernel_L2_M8_42
dtrmm_kernel_L2_M8_100: .Ldtrmm_kernel_L2_M8_100:
SAVE8x2 SAVE8x2
@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif
dtrmm_kernel_L2_M8_END: .Ldtrmm_kernel_L2_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20 bgt .Ldtrmm_kernel_L2_M8_20
dtrmm_kernel_L2_M4_BEGIN: .Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L2_M2_BEGIN ble .Ldtrmm_kernel_L2_M2_BEGIN
dtrmm_kernel_L2_M4_20: .Ldtrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40 ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5
dtrmm_kernel_L2_M4_22: .Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22 bgt .Ldtrmm_kernel_L2_M4_22
dtrmm_kernel_L2_M4_40: .Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100 ble .Ldtrmm_kernel_L2_M4_100
dtrmm_kernel_L2_M4_42: .Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42 bgt .Ldtrmm_kernel_L2_M4_42
dtrmm_kernel_L2_M4_100: .Ldtrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L2_M4_END: .Ldtrmm_kernel_L2_M4_END:
dtrmm_kernel_L2_M2_BEGIN: .Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN ble .Ldtrmm_kernel_L2_M1_BEGIN
dtrmm_kernel_L2_M2_20: .Ldtrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40 ble .Ldtrmm_kernel_L2_M2_40
dtrmm_kernel_L2_M2_22: .Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22 bgt .Ldtrmm_kernel_L2_M2_22
dtrmm_kernel_L2_M2_40: .Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100 ble .Ldtrmm_kernel_L2_M2_100
dtrmm_kernel_L2_M2_42: .Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42 bgt .Ldtrmm_kernel_L2_M2_42
dtrmm_kernel_L2_M2_100: .Ldtrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L2_M2_END: .Ldtrmm_kernel_L2_M2_END:
dtrmm_kernel_L2_M1_BEGIN: .Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END ble .Ldtrmm_kernel_L2_END
dtrmm_kernel_L2_M1_20: .Ldtrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40 ble .Ldtrmm_kernel_L2_M1_40
dtrmm_kernel_L2_M1_22: .Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22 bgt .Ldtrmm_kernel_L2_M1_22
dtrmm_kernel_L2_M1_40: .Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100 ble .Ldtrmm_kernel_L2_M1_100
dtrmm_kernel_L2_M1_42: .Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42 bgt .Ldtrmm_kernel_L2_M1_42
dtrmm_kernel_L2_M1_100: .Ldtrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L2_END: .Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
dtrmm_kernel_L1_BEGIN: .Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A
dtrmm_kernel_L1_M8_BEGIN: .Ldtrmm_kernel_L1_M8_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN ble .Ldtrmm_kernel_L1_M4_BEGIN
dtrmm_kernel_L1_M8_20: .Ldtrmm_kernel_L1_M8_20:
INIT8x1 INIT8x1
@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M8_40 ble .Ldtrmm_kernel_L1_M8_40
.align 5 .align 5
dtrmm_kernel_L1_M8_22: .Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22 bgt .Ldtrmm_kernel_L1_M8_22
dtrmm_kernel_L1_M8_40: .Ldtrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100 ble .Ldtrmm_kernel_L1_M8_100
dtrmm_kernel_L1_M8_42: .Ldtrmm_kernel_L1_M8_42:
KERNEL8x1_SUB KERNEL8x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42 bgt .Ldtrmm_kernel_L1_M8_42
dtrmm_kernel_L1_M8_100: .Ldtrmm_kernel_L1_M8_100:
SAVE8x1 SAVE8x1
@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif
dtrmm_kernel_L1_M8_END: .Ldtrmm_kernel_L1_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20 bgt .Ldtrmm_kernel_L1_M8_20
dtrmm_kernel_L1_M4_BEGIN: .Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L1_M2_BEGIN ble .Ldtrmm_kernel_L1_M2_BEGIN
dtrmm_kernel_L1_M4_20: .Ldtrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40 ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5
dtrmm_kernel_L1_M4_22: .Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22 bgt .Ldtrmm_kernel_L1_M4_22
dtrmm_kernel_L1_M4_40: .Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100 ble .Ldtrmm_kernel_L1_M4_100
dtrmm_kernel_L1_M4_42: .Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42 bgt .Ldtrmm_kernel_L1_M4_42
dtrmm_kernel_L1_M4_100: .Ldtrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
dtrmm_kernel_L1_M4_END: .Ldtrmm_kernel_L1_M4_END:
dtrmm_kernel_L1_M2_BEGIN: .Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN ble .Ldtrmm_kernel_L1_M1_BEGIN
dtrmm_kernel_L1_M2_20: .Ldtrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40 ble .Ldtrmm_kernel_L1_M2_40
dtrmm_kernel_L1_M2_22: .Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22 bgt .Ldtrmm_kernel_L1_M2_22
dtrmm_kernel_L1_M2_40: .Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100 ble .Ldtrmm_kernel_L1_M2_100
dtrmm_kernel_L1_M2_42: .Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42 bgt .Ldtrmm_kernel_L1_M2_42
dtrmm_kernel_L1_M2_100: .Ldtrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L1_M2_END: .Ldtrmm_kernel_L1_M2_END:
dtrmm_kernel_L1_M1_BEGIN: .Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END ble .Ldtrmm_kernel_L1_END
dtrmm_kernel_L1_M1_20: .Ldtrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40 ble .Ldtrmm_kernel_L1_M1_40
dtrmm_kernel_L1_M1_22: .Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22 bgt .Ldtrmm_kernel_L1_M1_22
dtrmm_kernel_L1_M1_40: .Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100 ble .Ldtrmm_kernel_L1_M1_100
dtrmm_kernel_L1_M1_42: .Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42 bgt .Ldtrmm_kernel_L1_M1_42
dtrmm_kernel_L1_M1_100: .Ldtrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
dtrmm_kernel_L1_END: .Ldtrmm_kernel_L1_END:
dtrmm_kernel_L999: .Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS
cmp N, xzr cmp N, xzr
ble gemv_n_kernel_L999 ble .Lgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_n_kernel_L999 ble .Lgemv_n_kernel_L999
lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
mov J, N mov J, N
cmp INC_Y, #1 cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN bne .Lgemv_n_kernel_S_BEGIN
gemv_n_kernel_F_LOOP: .Lgemv_n_kernel_F_LOOP:
ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
gemv_n_kernel_F32: .Lgemv_n_kernel_F32:
asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F4 beq .Lgemv_n_kernel_F4
gemv_n_kernel_F320: .Lgemv_n_kernel_F320:
KERNEL_F16 KERNEL_F16
KERNEL_F16 KERNEL_F16
subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F320 bne .Lgemv_n_kernel_F320
gemv_n_kernel_F4: .Lgemv_n_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F1 beq .Lgemv_n_kernel_F1
gemv_n_kernel_F40: .Lgemv_n_kernel_F40:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F40 bne .Lgemv_n_kernel_F40
gemv_n_kernel_F1: .Lgemv_n_kernel_F1:
ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_F_END ble .Lgemv_n_kernel_F_END
gemv_n_kernel_F10: .Lgemv_n_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F10 bne .Lgemv_n_kernel_F10
gemv_n_kernel_F_END: .Lgemv_n_kernel_F_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_F_LOOP bne .Lgemv_n_kernel_F_LOOP
b gemv_n_kernel_L999 b .Lgemv_n_kernel_L999
gemv_n_kernel_S_BEGIN: .Lgemv_n_kernel_S_BEGIN:
INIT_S INIT_S
gemv_n_kernel_S_LOOP: .Lgemv_n_kernel_S_LOOP:
ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_n_kernel_S1 ble .Lgemv_n_kernel_S1
gemv_n_kernel_S4: .Lgemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S4 bne .Lgemv_n_kernel_S4
gemv_n_kernel_S1: .Lgemv_n_kernel_S1:
ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_S_END ble .Lgemv_n_kernel_S_END
gemv_n_kernel_S10: .Lgemv_n_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S10 bne .Lgemv_n_kernel_S10
gemv_n_kernel_S_END: .Lgemv_n_kernel_S_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_S_LOOP bne .Lgemv_n_kernel_S_LOOP
gemv_n_kernel_L999: .Lgemv_n_kernel_L999:
mov w0, wzr mov w0, wzr

View File

@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS
cmp N, xzr cmp N, xzr
ble gemv_t_kernel_L999 ble .Lgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_t_kernel_L999 ble .Lgemv_t_kernel_L999
lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
mov J, N mov J, N
cmp INC_X, #1 cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN bne .Lgemv_t_kernel_S_BEGIN
gemv_t_kernel_F_LOOP: .Lgemv_t_kernel_F_LOOP:
fmov TEMP, REG0 fmov TEMP, REG0
fmov TEMP1, REG0 fmov TEMP1, REG0
@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
gemv_t_kernel_F32: .Lgemv_t_kernel_F32:
asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F4 beq .Lgemv_t_kernel_F4
gemv_t_kernel_F320: .Lgemv_t_kernel_F320:
KERNEL_F32 KERNEL_F32
subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F320 bne .Lgemv_t_kernel_F320
KERNEL_F32_FINALIZE KERNEL_F32_FINALIZE
gemv_t_kernel_F4: .Lgemv_t_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F1 beq .Lgemv_t_kernel_F1
gemv_t_kernel_F40: .Lgemv_t_kernel_F40:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F40 bne .Lgemv_t_kernel_F40
gemv_t_kernel_F1: .Lgemv_t_kernel_F1:
KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE
ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_F_END ble .Lgemv_t_kernel_F_END
gemv_t_kernel_F10: .Lgemv_t_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F10 bne .Lgemv_t_kernel_F10
gemv_t_kernel_F_END: .Lgemv_t_kernel_F_END:
ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP bne .Lgemv_t_kernel_F_LOOP
b gemv_t_kernel_L999 b .Lgemv_t_kernel_L999
gemv_t_kernel_S_BEGIN: .Lgemv_t_kernel_S_BEGIN:
INIT_S INIT_S
gemv_t_kernel_S_LOOP: .Lgemv_t_kernel_S_LOOP:
fmov TEMP, REG0 fmov TEMP, REG0
mov A_PTR, A mov A_PTR, A
@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_t_kernel_S1 ble .Lgemv_t_kernel_S1
gemv_t_kernel_S4: .Lgemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S4 bne .Lgemv_t_kernel_S4
gemv_t_kernel_S1: .Lgemv_t_kernel_S1:
ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_S_END ble .Lgemv_t_kernel_S_END
gemv_t_kernel_S10: .Lgemv_t_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S10 bne .Lgemv_t_kernel_S10
gemv_t_kernel_S_END: .Lgemv_t_kernel_S_END:
ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP bne .Lgemv_t_kernel_S_LOOP
gemv_t_kernel_L999: .Lgemv_t_kernel_L999:
RESTORE_REGS RESTORE_REGS

View File

@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble iamax_kernel_zero ble .Liamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero ble .Liamax_kernel_zero
cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN bne .Liamax_kernel_S_BEGIN
mov x7, X mov x7, X
iamax_kernel_F_BEGIN: .Liamax_kernel_F_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999 ble .Liamax_kernel_L999
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq iamax_kernel_F1 beq .Liamax_kernel_F1
add Z, Z, #1 add Z, Z, #1
iamax_kernel_F8: .Liamax_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8 bne .Liamax_kernel_F8
KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE
sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1: .Liamax_kernel_F1:
ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999 ble .Liamax_kernel_L999
iamax_kernel_F10: .Liamax_kernel_F10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10 bne .Liamax_kernel_F10
b iamax_kernel_L999 b .Liamax_kernel_L999
iamax_kernel_S_BEGIN: .Liamax_kernel_S_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999 ble .Liamax_kernel_L999
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1 ble .Liamax_kernel_S1
iamax_kernel_S4: .Liamax_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4 bne .Liamax_kernel_S4
iamax_kernel_S1: .Liamax_kernel_S1:
ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999 ble .Liamax_kernel_L999
iamax_kernel_S10: .Liamax_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10 bne .Liamax_kernel_S10
iamax_kernel_L999: .Liamax_kernel_L999:
mov x0, INDEX mov x0, INDEX
ret ret
iamax_kernel_zero: .Liamax_kernel_zero:
mov x0, xzr mov x0, xzr
ret ret

View File

@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble iamax_kernel_zero ble .Lizamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero ble .Lizamax_kernel_zero
cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN bne .Lizamax_kernel_S_BEGIN
mov x7, X mov x7, X
iamax_kernel_F_BEGIN: .Lizamax_kernel_F_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999 ble .Lizamax_kernel_L999
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
ble iamax_kernel_F1 ble .Lizamax_kernel_F1
add Z, Z, #1 add Z, Z, #1
iamax_kernel_F8: .Lizamax_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8 bne .Lizamax_kernel_F8
KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE
sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1: .Lizamax_kernel_F1:
ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999 ble .Lizamax_kernel_L999
iamax_kernel_F10: .Lizamax_kernel_F10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10 bne .Lizamax_kernel_F10
b iamax_kernel_L999 b .Lizamax_kernel_L999
iamax_kernel_S_BEGIN: .Lizamax_kernel_S_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999 ble .Lizamax_kernel_L999
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1 ble .Lizamax_kernel_S1
iamax_kernel_S4: .Lizamax_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4 bne .Lizamax_kernel_S4
iamax_kernel_S1: .Lizamax_kernel_S1:
ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999 ble .Lizamax_kernel_L999
iamax_kernel_S10: .Lizamax_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10 bne .Lizamax_kernel_S10
iamax_kernel_L999: .Lizamax_kernel_L999:
mov x0, INDEX mov x0, INDEX
ret ret
iamax_kernel_zero: .Lizamax_kernel_zero:
mov x0, xzr mov x0, xzr
ret ret

View File

@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT INIT
cmp N, #0 cmp N, #0
ble nrm2_kernel_L999 ble .Lnrm2_kernel_L999
cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999 beq .Lnrm2_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN bne .Lnrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN: .Lnrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1 ble .Lnrm2_kernel_F1
nrm2_kernel_F8: .Lnrm2_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8 bne .Lnrm2_kernel_F8
nrm2_kernel_F1: .Lnrm2_kernel_F1:
ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999 ble .Lnrm2_kernel_L999
nrm2_kernel_F10: .Lnrm2_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10 bne .Lnrm2_kernel_F10
b nrm2_kernel_L999 b .Lnrm2_kernel_L999
nrm2_kernel_S_BEGIN: .Lnrm2_kernel_S_BEGIN:
INIT_S INIT_S
@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:
.align 5 .align 5
nrm2_kernel_S10: .Lnrm2_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10 bne .Lnrm2_kernel_S10
nrm2_kernel_L999: .Lnrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ

View File

@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble rot_kernel_L999 ble .Lrot_kernel_L999
INIT INIT
cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN bne .Lrot_kernel_S_BEGIN
rot_kernel_F_BEGIN: .Lrot_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1 beq .Lrot_kernel_F1
KERNEL_INIT_F4 KERNEL_INIT_F4
rot_kernel_F4: .Lrot_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne rot_kernel_F4 bne .Lrot_kernel_F4
rot_kernel_F1: .Lrot_kernel_F1:
ands I, N, #3 ands I, N, #3
ble rot_kernel_L999 ble .Lrot_kernel_L999
INIT_F1 INIT_F1
rot_kernel_F10: .Lrot_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne rot_kernel_F10 bne .Lrot_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
rot_kernel_S_BEGIN: .Lrot_kernel_S_BEGIN:
INIT_S INIT_S
INIT_F1 INIT_F1
@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1 ble .Lrot_kernel_S1
rot_kernel_S4: .Lrot_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne rot_kernel_S4 bne .Lrot_kernel_S4
rot_kernel_S1: .Lrot_kernel_S1:
ands I, N, #3 ands I, N, #3
ble rot_kernel_L999 ble .Lrot_kernel_L999
rot_kernel_S10: .Lrot_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne rot_kernel_S10 bne .Lrot_kernel_S10
rot_kernel_L999: .Lrot_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble scal_kernel_L999 ble .Lscal_kernel_L999
fcmp DA, #0.0 fcmp DA, #0.0
beq scal_kernel_zero beq .Lscal_kernel_zero
cmp INC_X, #1 cmp INC_X, #1
bne scal_kernel_S_BEGIN bne .Lscal_kernel_S_BEGIN
scal_kernel_F_BEGIN: .Lscal_kernel_F_BEGIN:
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq scal_kernel_F1 beq .Lscal_kernel_F1
KERNEL_INIT_F8 KERNEL_INIT_F8
scal_kernel_F8: .Lscal_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne scal_kernel_F8 bne .Lscal_kernel_F8
scal_kernel_F1: .Lscal_kernel_F1:
ands I, N, #7 ands I, N, #7
ble scal_kernel_L999 ble .Lscal_kernel_L999
scal_kernel_F10: .Lscal_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne scal_kernel_F10 bne .Lscal_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
scal_kernel_S_BEGIN: .Lscal_kernel_S_BEGIN:
INIT_S INIT_S
mov X_COPY, X mov X_COPY, X
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble scal_kernel_S1 ble .Lscal_kernel_S1
scal_kernel_S4: .Lscal_kernel_S4:
KERNEL_S4 KERNEL_S4
subs I, I, #1 subs I, I, #1
bne scal_kernel_S4 bne .Lscal_kernel_S4
scal_kernel_S1: .Lscal_kernel_S1:
ands I, N, #3 ands I, N, #3
ble scal_kernel_L999 ble .Lscal_kernel_L999
scal_kernel_S10: .Lscal_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne scal_kernel_S10 bne .Lscal_kernel_S10
scal_kernel_L999: .Lscal_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret
scal_kernel_zero: .Lscal_kernel_zero:
INIT_S INIT_S
scal_kernel_Z1: .Lscal_kernel_Z1:
st1 DAV, [X], INC_X st1 DAV, [X], INC_X
subs N, N, #1 subs N, N, #1
bne scal_kernel_Z1 bne .Lscal_kernel_Z1
mov w0, wzr mov w0, wzr
ret ret

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
sgemm_kernel_L4_BEGIN: .Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1 add pA_2, temp, pA_1
add pA_3, temp, pA_2 add pA_3, temp, pA_2
sgemm_kernel_L4_M16_BEGIN: .Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16 asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN ble .Lsgemm_kernel_L4_M8_BEGIN
sgemm_kernel_L4_M16_20: .Lsgemm_kernel_L4_M16_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32 blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I // do one in the K KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K KERNEL16x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a ble .Lsgemm_kernel_L4_M16_22a
.align 5 .align 5
sgemm_kernel_L4_M16_22: .Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_M2 KERNEL16x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22 bgt .Lsgemm_kernel_L4_M16_22
sgemm_kernel_L4_M16_22a: .Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_E KERNEL16x4_E
b sgemm_kernel_L4_M16_44 b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_32: .Lsgemm_kernel_L4_M16_32:
tst counterL, #1 tst counterL, #1
ble sgemm_kernel_L4_M16_40 ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I KERNEL16x4_I
KERNEL16x4_E KERNEL16x4_E
b sgemm_kernel_L4_M16_44 b .Lsgemm_kernel_L4_M16_44
sgemm_kernel_L4_M16_40: .Lsgemm_kernel_L4_M16_40:
INIT16x4 INIT16x4
sgemm_kernel_L4_M16_44: .Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100 ble .Lsgemm_kernel_L4_M16_100
sgemm_kernel_L4_M16_46: .Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB KERNEL16x4_SUB
sgemm_kernel_L4_M16_100: .Lsgemm_kernel_L4_M16_100:
SAVE16x4 SAVE16x4
sgemm_kernel_L4_M16_END: .Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp add pA_0, pA_0, temp
add pA_0, pA_0, temp add pA_0, pA_0, temp
@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp add pA_2, pA_1, temp
add pA_3, pA_2, temp add pA_3, pA_2, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20 bne .Lsgemm_kernel_L4_M16_20
sgemm_kernel_L4_M8_BEGIN: .Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #15 tst counterI , #15
ble sgemm_kernel_L4_END ble .Lsgemm_kernel_L4_END
tst counterI, #8 tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN ble .Lsgemm_kernel_L4_M4_BEGIN
sgemm_kernel_L4_M8_20: .Lsgemm_kernel_L4_M8_20:
INIT8x4 INIT8x4
mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M8_40 ble .Lsgemm_kernel_L4_M8_40
sgemm_kernel_L4_M8_22: .Lsgemm_kernel_L4_M8_22:
KERNEL8x4_SUB KERNEL8x4_SUB
KERNEL8x4_SUB KERNEL8x4_SUB
@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22 bgt .Lsgemm_kernel_L4_M8_22
sgemm_kernel_L4_M8_40: .Lsgemm_kernel_L4_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100 ble .Lsgemm_kernel_L4_M8_100
sgemm_kernel_L4_M8_42: .Lsgemm_kernel_L4_M8_42:
KERNEL8x4_SUB KERNEL8x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42 bgt .Lsgemm_kernel_L4_M8_42
sgemm_kernel_L4_M8_100: .Lsgemm_kernel_L4_M8_100:
SAVE8x4 SAVE8x4
sgemm_kernel_L4_M8_END: .Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4 lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp add pA_0, pA_0, temp
sgemm_kernel_L4_M4_BEGIN: .Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble sgemm_kernel_L4_END ble .Lsgemm_kernel_L4_END
tst counterI, #4 tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN ble .Lsgemm_kernel_L4_M2_BEGIN
sgemm_kernel_L4_M4_20: .Lsgemm_kernel_L4_M4_20:
INIT4x4 INIT4x4
mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M4_40 ble .Lsgemm_kernel_L4_M4_40
sgemm_kernel_L4_M4_22: .Lsgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22 bgt .Lsgemm_kernel_L4_M4_22
sgemm_kernel_L4_M4_40: .Lsgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100 ble .Lsgemm_kernel_L4_M4_100
sgemm_kernel_L4_M4_42: .Lsgemm_kernel_L4_M4_42:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42 bgt .Lsgemm_kernel_L4_M4_42
sgemm_kernel_L4_M4_100: .Lsgemm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
sgemm_kernel_L4_M4_END: .Lsgemm_kernel_L4_M4_END:
sgemm_kernel_L4_M2_BEGIN: .Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L4_END ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L4_M1_BEGIN ble .Lsgemm_kernel_L4_M1_BEGIN
sgemm_kernel_L4_M2_20: .Lsgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M2_40 ble .Lsgemm_kernel_L4_M2_40
sgemm_kernel_L4_M2_22: .Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22 bgt .Lsgemm_kernel_L4_M2_22
sgemm_kernel_L4_M2_40: .Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100 ble .Lsgemm_kernel_L4_M2_100
sgemm_kernel_L4_M2_42: .Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42 bgt .Lsgemm_kernel_L4_M2_42
sgemm_kernel_L4_M2_100: .Lsgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
sgemm_kernel_L4_M2_END: .Lsgemm_kernel_L4_M2_END:
sgemm_kernel_L4_M1_BEGIN: .Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L4_END ble .Lsgemm_kernel_L4_END
sgemm_kernel_L4_M1_20: .Lsgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M1_40 ble .Lsgemm_kernel_L4_M1_40
sgemm_kernel_L4_M1_22: .Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22 bgt .Lsgemm_kernel_L4_M1_22
sgemm_kernel_L4_M1_40: .Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100 ble .Lsgemm_kernel_L4_M1_100
sgemm_kernel_L4_M1_42: .Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42 bgt .Lsgemm_kernel_L4_M1_42
sgemm_kernel_L4_M1_100: .Lsgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
sgemm_kernel_L4_END: .Lsgemm_kernel_L4_END:
lsl temp, origK, #4 lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4 add origPB, origPB, temp // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
sgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble sgemm_kernel_L999 ble .Lsgemm_kernel_L999
tst counterJ , #2 tst counterJ , #2
ble sgemm_kernel_L1_BEGIN ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
sgemm_kernel_L2_M4_BEGIN: .Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN ble .Lsgemm_kernel_L2_M2_BEGIN
sgemm_kernel_L2_M4_20: .Lsgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M4_40 ble .Lsgemm_kernel_L2_M4_40
.align 5 .align 5
sgemm_kernel_L2_M4_22: .Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22 bgt .Lsgemm_kernel_L2_M4_22
sgemm_kernel_L2_M4_40: .Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100 ble .Lsgemm_kernel_L2_M4_100
sgemm_kernel_L2_M4_42: .Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42 bgt .Lsgemm_kernel_L2_M4_42
sgemm_kernel_L2_M4_100: .Lsgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
sgemm_kernel_L2_M4_END: .Lsgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20 bgt .Lsgemm_kernel_L2_M4_20
sgemm_kernel_L2_M2_BEGIN: .Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L2_END ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L2_M1_BEGIN ble .Lsgemm_kernel_L2_M1_BEGIN
sgemm_kernel_L2_M2_20: .Lsgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M2_40 ble .Lsgemm_kernel_L2_M2_40
sgemm_kernel_L2_M2_22: .Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22 bgt .Lsgemm_kernel_L2_M2_22
sgemm_kernel_L2_M2_40: .Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100 ble .Lsgemm_kernel_L2_M2_100
sgemm_kernel_L2_M2_42: .Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42 bgt .Lsgemm_kernel_L2_M2_42
sgemm_kernel_L2_M2_100: .Lsgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
sgemm_kernel_L2_M2_END: .Lsgemm_kernel_L2_M2_END:
sgemm_kernel_L2_M1_BEGIN: .Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L2_END ble .Lsgemm_kernel_L2_END
sgemm_kernel_L2_M1_20: .Lsgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L2_M1_40 ble .Lsgemm_kernel_L2_M1_40
sgemm_kernel_L2_M1_22: .Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22 bgt .Lsgemm_kernel_L2_M1_22
sgemm_kernel_L2_M1_40: .Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100 ble .Lsgemm_kernel_L2_M1_100
sgemm_kernel_L2_M1_42: .Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42 bgt .Lsgemm_kernel_L2_M1_42
sgemm_kernel_L2_M1_100: .Lsgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
sgemm_kernel_L2_END: .Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/ /******************************************************************************/
sgemm_kernel_L1_BEGIN: .Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble sgemm_kernel_L999 // done ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:
sgemm_kernel_L1_M4_BEGIN: .Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN ble .Lsgemm_kernel_L1_M2_BEGIN
sgemm_kernel_L1_M4_20: .Lsgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M4_40 ble .Lsgemm_kernel_L1_M4_40
.align 5 .align 5
sgemm_kernel_L1_M4_22: .Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22 bgt .Lsgemm_kernel_L1_M4_22
sgemm_kernel_L1_M4_40: .Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100 ble .Lsgemm_kernel_L1_M4_100
sgemm_kernel_L1_M4_42: .Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42 bgt .Lsgemm_kernel_L1_M4_42
sgemm_kernel_L1_M4_100: .Lsgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
sgemm_kernel_L1_M4_END: .Lsgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20 bgt .Lsgemm_kernel_L1_M4_20
sgemm_kernel_L1_M2_BEGIN: .Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L1_END ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L1_M1_BEGIN ble .Lsgemm_kernel_L1_M1_BEGIN
sgemm_kernel_L1_M2_20: .Lsgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M2_40 ble .Lsgemm_kernel_L1_M2_40
sgemm_kernel_L1_M2_22: .Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22 bgt .Lsgemm_kernel_L1_M2_22
sgemm_kernel_L1_M2_40: .Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100 ble .Lsgemm_kernel_L1_M2_100
sgemm_kernel_L1_M2_42: .Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42 bgt .Lsgemm_kernel_L1_M2_42
sgemm_kernel_L1_M2_100: .Lsgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
sgemm_kernel_L1_M2_END: .Lsgemm_kernel_L1_M2_END:
sgemm_kernel_L1_M1_BEGIN: .Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L1_END ble .Lsgemm_kernel_L1_END
sgemm_kernel_L1_M1_20: .Lsgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M1_40 ble .Lsgemm_kernel_L1_M1_40
sgemm_kernel_L1_M1_22: .Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22 bgt .Lsgemm_kernel_L1_M1_22
sgemm_kernel_L1_M1_40: .Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100 ble .Lsgemm_kernel_L1_M1_100
sgemm_kernel_L1_M1_42: .Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42 bgt .Lsgemm_kernel_L1_M1_42
sgemm_kernel_L1_M1_100: .Lsgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
sgemm_kernel_L1_END: .Lsgemm_kernel_L1_END:
sgemm_kernel_L999: .Lsgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
strmm_kernel_begin: .Lstrmm_kernel_begin:
.align 5 .align 5
add sp, sp, #-(11 * 16) add sp, sp, #-(11 * 16)
@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble strmm_kernel_L2_BEGIN ble .Lstrmm_kernel_L2_BEGIN
/******************************************************************************/ /******************************************************************************/
strmm_kernel_L4_BEGIN: .Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
strmm_kernel_L4_M4_BEGIN: .Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN ble .Lstrmm_kernel_L4_M2_BEGIN
strmm_kernel_L4_M4_20: .Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:
asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32 blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2 subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a ble .Lstrmm_kernel_L4_M4_22a
.align 5 .align 5
strmm_kernel_L4_M4_22: .Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22 bgt .Lstrmm_kernel_L4_M4_22
strmm_kernel_L4_M4_22a: .Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b strmm_kernel_L4_M4_44 b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_32: .Lstrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble strmm_kernel_L4_M4_40 ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b strmm_kernel_L4_M4_44 b .Lstrmm_kernel_L4_M4_44
strmm_kernel_L4_M4_40: .Lstrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
strmm_kernel_L4_M4_44: .Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100 ble .Lstrmm_kernel_L4_M4_100
strmm_kernel_L4_M4_46: .Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
strmm_kernel_L4_M4_100: .Lstrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
strmm_kernel_L4_M4_END: .Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20 bne .Lstrmm_kernel_L4_M4_20
strmm_kernel_L4_M2_BEGIN: .Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L4_END ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L4_M1_BEGIN ble .Lstrmm_kernel_L4_M1_BEGIN
strmm_kernel_L4_M2_20: .Lstrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M2_40 ble .Lstrmm_kernel_L4_M2_40
strmm_kernel_L4_M2_22: .Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22 bgt .Lstrmm_kernel_L4_M2_22
strmm_kernel_L4_M2_40: .Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100 ble .Lstrmm_kernel_L4_M2_100
strmm_kernel_L4_M2_42: .Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42 bgt .Lstrmm_kernel_L4_M2_42
strmm_kernel_L4_M2_100: .Lstrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif #endif
strmm_kernel_L4_M2_END: .Lstrmm_kernel_L4_M2_END:
strmm_kernel_L4_M1_BEGIN: .Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L4_END ble .Lstrmm_kernel_L4_END
strmm_kernel_L4_M1_20: .Lstrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M1_40 ble .Lstrmm_kernel_L4_M1_40
strmm_kernel_L4_M1_22: .Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22 bgt .Lstrmm_kernel_L4_M1_22
strmm_kernel_L4_M1_40: .Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100 ble .Lstrmm_kernel_L4_M1_100
strmm_kernel_L4_M1_42: .Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42 bgt .Lstrmm_kernel_L4_M1_42
strmm_kernel_L4_M1_100: .Lstrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif #endif
strmm_kernel_L4_END: .Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT) #if !defined(LEFT)
@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN bgt .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
strmm_kernel_L2_BEGIN: // less than 2 left in N direction .Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble strmm_kernel_L999 ble .Lstrmm_kernel_L999
tst counterJ , #2 tst counterJ , #2
ble strmm_kernel_L1_BEGIN ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
strmm_kernel_L2_M4_BEGIN: .Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN ble .Lstrmm_kernel_L2_M2_BEGIN
strmm_kernel_L2_M4_20: .Lstrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M4_40 ble .Lstrmm_kernel_L2_M4_40
.align 5 .align 5
strmm_kernel_L2_M4_22: .Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22 bgt .Lstrmm_kernel_L2_M4_22
strmm_kernel_L2_M4_40: .Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100 ble .Lstrmm_kernel_L2_M4_100
strmm_kernel_L2_M4_42: .Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42 bgt .Lstrmm_kernel_L2_M4_42
strmm_kernel_L2_M4_100: .Lstrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
strmm_kernel_L2_M4_END: .Lstrmm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20 bgt .Lstrmm_kernel_L2_M4_20
strmm_kernel_L2_M2_BEGIN: .Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L2_END ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L2_M1_BEGIN ble .Lstrmm_kernel_L2_M1_BEGIN
strmm_kernel_L2_M2_20: .Lstrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M2_40 ble .Lstrmm_kernel_L2_M2_40
strmm_kernel_L2_M2_22: .Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22 bgt .Lstrmm_kernel_L2_M2_22
strmm_kernel_L2_M2_40: .Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100 ble .Lstrmm_kernel_L2_M2_100
strmm_kernel_L2_M2_42: .Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42 bgt .Lstrmm_kernel_L2_M2_42
strmm_kernel_L2_M2_100: .Lstrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
strmm_kernel_L2_M2_END: .Lstrmm_kernel_L2_M2_END:
strmm_kernel_L2_M1_BEGIN: .Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L2_END ble .Lstrmm_kernel_L2_END
strmm_kernel_L2_M1_20: .Lstrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble strmm_kernel_L2_M1_40 ble .Lstrmm_kernel_L2_M1_40
strmm_kernel_L2_M1_22: .Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22 bgt .Lstrmm_kernel_L2_M1_22
strmm_kernel_L2_M1_40: .Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100 ble .Lstrmm_kernel_L2_M1_100
strmm_kernel_L2_M1_42: .Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42 bgt .Lstrmm_kernel_L2_M1_42
strmm_kernel_L2_M1_100: .Lstrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
strmm_kernel_L2_END: .Lstrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
strmm_kernel_L1_BEGIN: .Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble strmm_kernel_L999 // done ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A
strmm_kernel_L1_M4_BEGIN: .Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN ble .Lstrmm_kernel_L1_M2_BEGIN
strmm_kernel_L1_M4_20: .Lstrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M4_40 ble .Lstrmm_kernel_L1_M4_40
.align 5 .align 5
strmm_kernel_L1_M4_22: .Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22 bgt .Lstrmm_kernel_L1_M4_22
strmm_kernel_L1_M4_40: .Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100 ble .Lstrmm_kernel_L1_M4_100
strmm_kernel_L1_M4_42: .Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42 bgt .Lstrmm_kernel_L1_M4_42
strmm_kernel_L1_M4_100: .Lstrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
strmm_kernel_L1_M4_END: .Lstrmm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20 bgt .Lstrmm_kernel_L1_M4_20
strmm_kernel_L1_M2_BEGIN: .Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L1_END ble .Lstrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L1_M1_BEGIN ble .Lstrmm_kernel_L1_M1_BEGIN
strmm_kernel_L1_M2_20: .Lstrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M2_40 ble .Lstrmm_kernel_L1_M2_40
strmm_kernel_L1_M2_22: .Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22 bgt .Lstrmm_kernel_L1_M2_22
strmm_kernel_L1_M2_40: .Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100 ble .Lstrmm_kernel_L1_M2_100
strmm_kernel_L1_M2_42: .Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42 bgt .Lstrmm_kernel_L1_M2_42
strmm_kernel_L1_M2_100: .Lstrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif #endif
strmm_kernel_L1_M2_END: .Lstrmm_kernel_L1_M2_END:
strmm_kernel_L1_M1_BEGIN: .Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L1_END ble .Lstrmm_kernel_L1_END
strmm_kernel_L1_M1_20: .Lstrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M1_40 ble .Lstrmm_kernel_L1_M1_40
strmm_kernel_L1_M1_22: .Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22 bgt .Lstrmm_kernel_L1_M1_22
strmm_kernel_L1_M1_40: .Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100 ble .Lstrmm_kernel_L1_M1_100
strmm_kernel_L1_M1_42: .Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42 bgt .Lstrmm_kernel_L1_M1_42
strmm_kernel_L1_M1_100: .Lstrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif #endif
#endif #endif
strmm_kernel_L1_END: .Lstrmm_kernel_L1_END:
#if 0 #if 0
#if !defined(LEFT) #if !defined(LEFT)
@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif #endif
#endif #endif
strmm_kernel_L999: .Lstrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

File diff suppressed because it is too large Load Diff

View File

@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble swap_kernel_L999 ble .Lswap_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne swap_kernel_S_BEGIN bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne swap_kernel_S_BEGIN bne .Lswap_kernel_S_BEGIN
swap_kernel_F_BEGIN: .Lswap_kernel_F_BEGIN:
asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq swap_kernel_F1 beq .Lswap_kernel_F1
swap_kernel_F8: .Lswap_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne swap_kernel_F8 bne .Lswap_kernel_F8
swap_kernel_F1: .Lswap_kernel_F1:
ands I, N, #7 ands I, N, #7
ble swap_kernel_L999 ble .Lswap_kernel_L999
swap_kernel_F10: .Lswap_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne swap_kernel_F10 bne .Lswap_kernel_F10
b swap_kernel_L999 b .Lswap_kernel_L999
swap_kernel_S_BEGIN: .Lswap_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble swap_kernel_S1 ble .Lswap_kernel_S1
swap_kernel_S4: .Lswap_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne swap_kernel_S4 bne .Lswap_kernel_S4
swap_kernel_S1: .Lswap_kernel_S1:
ands I, N, #3 ands I, N, #3
ble swap_kernel_L999 ble .Lswap_kernel_L999
swap_kernel_S10: .Lswap_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne swap_kernel_S10 bne .Lswap_kernel_S10
swap_kernel_L999: .Lswap_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble amax_kernel_zero ble .Lzamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero ble .Lzamax_kernel_zero
cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN bne .Lzamax_kernel_S_BEGIN
amax_kernel_F_BEGIN: .Lzamax_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT beq .Lzamax_kernel_F1_INIT
INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1 beq .Lzamax_kernel_F1
amax_kernel_F4: .Lzamax_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne amax_kernel_F4 bne .Lzamax_kernel_F4
amax_kernel_F1: .Lzamax_kernel_F1:
ands I, N, #3 ands I, N, #3
ble amax_kernel_L999 ble .Lzamax_kernel_L999
amax_kernel_F10: .Lzamax_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne amax_kernel_F10 bne .Lzamax_kernel_F10
ret ret
amax_kernel_F1_INIT: .Lzamax_kernel_F1_INIT:
INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1 b .Lzamax_kernel_F1
amax_kernel_S_BEGIN: .Lzamax_kernel_S_BEGIN:
INIT_S INIT_S
subs N, N, #1 subs N, N, #1
ble amax_kernel_L999 ble .Lzamax_kernel_L999
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1 ble .Lzamax_kernel_S1
amax_kernel_S4: .Lzamax_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne amax_kernel_S4 bne .Lzamax_kernel_S4
amax_kernel_S1: .Lzamax_kernel_S1:
ands I, N, #3 ands I, N, #3
ble amax_kernel_L999 ble .Lzamax_kernel_L999
amax_kernel_S10: .Lzamax_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne amax_kernel_S10 bne .Lzamax_kernel_S10
amax_kernel_L999: .Lzamax_kernel_L999:
ret ret
amax_kernel_zero: .Lzamax_kernel_zero:
fmov MAXF, REG0 fmov MAXF, REG0
ret ret

View File

@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0 fmov SUMF, REG0
cmp N, xzr cmp N, xzr
ble asum_kernel_L999 ble .Lzasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999 ble .Lzasum_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN bne .Lzasum_kernel_S_BEGIN
asum_kernel_F_BEGIN: .Lzasum_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq asum_kernel_F1 beq .Lzasum_kernel_F1
asum_kernel_F4: .Lzasum_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne asum_kernel_F4 bne .Lzasum_kernel_F4
KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE
asum_kernel_F1: .Lzasum_kernel_F1:
ands I, N, #3 ands I, N, #3
ble asum_kernel_L999 ble .Lzasum_kernel_L999
asum_kernel_F10: .Lzasum_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne asum_kernel_F10 bne .Lzasum_kernel_F10
asum_kernel_L999: .Lzasum_kernel_L999:
ret ret
asum_kernel_S_BEGIN: .Lzasum_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1 ble .Lzasum_kernel_S1
asum_kernel_S4: .Lzasum_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S4 bne .Lzasum_kernel_S4
asum_kernel_S1: .Lzasum_kernel_S1:
ands I, N, #3 ands I, N, #3
ble asum_kernel_L999 ble .Lzasum_kernel_L999
asum_kernel_S10: .Lzasum_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne asum_kernel_S10 bne .Lzasum_kernel_S10
ret ret

View File

@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble zaxpy_kernel_L999 ble .Lzaxpy_kernel_L999
mov Y_COPY, Y mov Y_COPY, Y
fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne .L1 bne .L1
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zaxpy_kernel_L999 beq .Lzaxpy_kernel_L999
.L1: .L1:
INIT INIT
cmp INC_X, #1 cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN bne .Lzaxpy_kernel_S_BEGIN
zaxpy_kernel_F_BEGIN: .Lzaxpy_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zaxpy_kernel_F1 beq .Lzaxpy_kernel_F1
KERNEL_INIT_F4 KERNEL_INIT_F4
zaxpy_kernel_F4: .Lzaxpy_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F4 bne .Lzaxpy_kernel_F4
zaxpy_kernel_F1: .Lzaxpy_kernel_F1:
ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999 ble .Lzaxpy_kernel_L999
zaxpy_kernel_F10: .Lzaxpy_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F10 bne .Lzaxpy_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
zaxpy_kernel_S_BEGIN: .Lzaxpy_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zaxpy_kernel_S1 ble .Lzaxpy_kernel_S1
zaxpy_kernel_S4: .Lzaxpy_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S4 bne .Lzaxpy_kernel_S4
zaxpy_kernel_S1: .Lzaxpy_kernel_S1:
ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999 ble .Lzaxpy_kernel_L999
zaxpy_kernel_S10: .Lzaxpy_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S10 bne .Lzaxpy_kernel_S10
zaxpy_kernel_L999: .Lzaxpy_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
cmp N, xzr cmp N, xzr
ble dot_kernel_L999 ble .Lzdot_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN bne .Lzdot_kernel_S_BEGIN
dot_kernel_F_BEGIN: .Lzdot_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1 beq .Lzdot_kernel_F1
dot_kernel_F4: .Lzdot_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne dot_kernel_F4 bne .Lzdot_kernel_F4
KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE
dot_kernel_F1: .Lzdot_kernel_F1:
ands I, N, #3 ands I, N, #3
ble dot_kernel_L999 ble .Lzdot_kernel_L999
dot_kernel_F10: .Lzdot_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne dot_kernel_F10 bne .Lzdot_kernel_F10
ret ret
dot_kernel_S_BEGIN: .Lzdot_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1 ble .Lzdot_kernel_S1
dot_kernel_S4: .Lzdot_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne dot_kernel_S4 bne .Lzdot_kernel_S4
dot_kernel_S1: .Lzdot_kernel_S1:
ands I, N, #3 ands I, N, #3
ble dot_kernel_L999 ble .Lzdot_kernel_L999
dot_kernel_S10: .Lzdot_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne dot_kernel_S10 bne .Lzdot_kernel_S10
dot_kernel_L999: .Lzdot_kernel_L999:
ret ret

View File

@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN ble .Lzgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN: .Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN: .Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5 .align 5
zgemm_kernel_L4_M4_20: .Lzgemm_kernel_L4_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32 blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a ble .Lzgemm_kernel_L4_M4_22a
.align 5 .align 5
zgemm_kernel_L4_M4_22: .Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22 bgt .Lzgemm_kernel_L4_M4_22
.align 5 .align 5
zgemm_kernel_L4_M4_22a: .Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b zgemm_kernel_L4_M4_44 b .Lzgemm_kernel_L4_M4_44
.align 5 .align 5
zgemm_kernel_L4_M4_32: .Lzgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40 ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b zgemm_kernel_L4_M4_44 b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40: .Lzgemm_kernel_L4_M4_40:
INIT4x4 INIT4x4
zgemm_kernel_L4_M4_44: .Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100 ble .Lzgemm_kernel_L4_M4_100
.align 5 .align 5
zgemm_kernel_L4_M4_46: .Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46 bne .Lzgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100: .Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
SAVE4x4 SAVE4x4
zgemm_kernel_L4_M4_END: .Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20 bne .Lzgemm_kernel_L4_M4_20
zgemm_kernel_L4_M2_BEGIN: .Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20: .Lzgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40 ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22: .Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22 bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40: .Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100 ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42: .Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42 bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100: .Lzgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
zgemm_kernel_L4_M2_END: .Lzgemm_kernel_L4_M2_END:
zgemm_kernel_L4_M1_BEGIN: .Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20: .Lzgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40 ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22: .Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22 bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40: .Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100 ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42: .Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42 bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100: .Lzgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
zgemm_kernel_L4_END: .Lzgemm_kernel_L4_END:
lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
zgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999 ble .Lzgemm_kernel_L999
tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
zgemm_kernel_L2_M4_BEGIN: .Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20: .Lzgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40 ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5
zgemm_kernel_L2_M4_22: .Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22 bgt .Lzgemm_kernel_L2_M4_22
zgemm_kernel_L2_M4_40: .Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100 ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42: .Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42 bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100: .Lzgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
zgemm_kernel_L2_M4_END: .Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20 bgt .Lzgemm_kernel_L2_M4_20
zgemm_kernel_L2_M2_BEGIN: .Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20: .Lzgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40 ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22: .Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22 bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40: .Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100 ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42: .Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42 bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100: .Lzgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
zgemm_kernel_L2_M2_END: .Lzgemm_kernel_L2_M2_END:
zgemm_kernel_L2_M1_BEGIN: .Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20: .Lzgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40 ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22: .Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22 bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40: .Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100 ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42: .Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42 bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100: .Lzgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
zgemm_kernel_L2_END: .Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/ /******************************************************************************/
// ---------------------------------------------------------------------------
// N tier "L1": handles one remaining column of N.
// Entered only when bit 0 of origN is set; otherwise control goes to L999.
// Inside, M is walked in tiles of 4 (M4), then 2 (M2), then 1 (M1).
// For every tile the K loop is split in two:
//   *_22 : origK >> 3 iterations of an unrolled run of KERNELmx1_SUB
//   *_42 : origK & 7 iterations of a single KERNELmx1_SUB
// INIT*/KERNEL*/SAVE* are macros defined outside this view; presumably they
// clear the accumulators, perform one K step, and write the tile to C.
// NOTE(review): the inline comments "counterL = counterL / 8" and
// "counterL = counterL % 8" really describe origK >> 3 and origK & 7, and
// "tst counterI, #2 // counterI = counterI / 2" is a bit test, not a division.
// ---------------------------------------------------------------------------
zgemm_kernel_L1_BEGIN: .Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:
// ---- M tiles of 4: counterI = origM >> 2, skipped when that is <= 0 ----
zgemm_kernel_L1_M4_BEGIN: .Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20: .Lzgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40 ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5
// Unrolled K loop; part of the KERNEL4x1_SUB run lies in elided diff context.
zgemm_kernel_L1_M4_22: .Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22 bgt .Lzgemm_kernel_L1_M4_22
// K remainder: origK & 7 single steps (branch taken when the remainder is 0).
zgemm_kernel_L1_M4_40: .Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100 ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42: .Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42 bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100: .Lzgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
zgemm_kernel_L1_M4_END: .Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20 bgt .Lzgemm_kernel_L1_M4_20
// ---- M tile of 2: leave the tier if origM & 3 == 0; skip to M1 if bit 1 is clear ----
zgemm_kernel_L1_M2_BEGIN: .Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20: .Lzgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40 ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22: .Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22 bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40: .Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100 ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42: .Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42 bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100: .Lzgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
zgemm_kernel_L1_M2_END: .Lzgemm_kernel_L1_M2_END:
// ---- M tile of 1: runs only when bit 0 of counterI (= origM) is set ----
zgemm_kernel_L1_M1_BEGIN: .Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20: .Lzgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40 ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22: .Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22 bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40: .Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100 ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42: .Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42 bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100: .Lzgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
zgemm_kernel_L1_END: .Lzgemm_kernel_L1_END:
// Common exit: return value 0 in x0, then reload d8-d11 from the stack frame.
// The remainder of the epilogue is outside the visible hunk.
zgemm_kernel_L999: .Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------------------
// N tier "L4": processes four columns of N per outer iteration.
// counterJ = origN >> 2 outer iterations; the tier is skipped when that is <= 0.
// Each iteration walks M in tiles of 4 (M4), then 2 (M2), then 1 (M1), and
// finally advances origPB by origK << 6 bytes.
// The 4x4 tile uses a pipelined macro sequence (KERNEL4x4_I / _M1 / _M2 / _E);
// the smaller tiles use INIT + repeated KERNELmx4_SUB. The macro bodies are
// outside this view; presumably _I starts and _E drains the pipeline.
// NOTE(review): inline comments "counterL = counterL / 8" and "% 8" really
// describe origK >> 3 and origK & 7; "tst counterI, #2 // counterI = counterI / 2"
// is a bit test, not a division.
// ---------------------------------------------------------------------------
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN ble .Lzgemm_kernel_L2_BEGIN
// Outer loop head: set up the per-column C row pointers, each LDC apart.
zgemm_kernel_L4_BEGIN: .Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
// ---- M tiles of 4: counterI = origM >> 2, skipped when that is <= 0 ----
zgemm_kernel_L4_M4_BEGIN: .Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5 .align 5
// counterL = origK >> 3 (blocks of 8 K steps). Fewer than 2 blocks is handled
// separately at *_32; otherwise the first block starts with KERNEL4x4_I.
zgemm_kernel_L4_M4_20: .Lzgemm_kernel_L4_M4_20:
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32 blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2
// Two blocks are accounted for here (the one just issued and the final one at
// *_22a); if nothing is left in between, go straight to the final block.
subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a ble .Lzgemm_kernel_L4_M4_22a
.align 5 .align 5
// Steady-state loop: one block of M1/M2 pairs per iteration.
zgemm_kernel_L4_M4_22: .Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22 bgt .Lzgemm_kernel_L4_M4_22
.align 5 .align 5
// Final block of the pipelined sequence, terminated by KERNEL4x4_E.
zgemm_kernel_L4_M4_22a: .Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b zgemm_kernel_L4_M4_44 b .Lzgemm_kernel_L4_M4_44
.align 5 .align 5
// Reached with counterL < 2: bit 0 set means exactly one block (I ... E);
// bit 0 clear means no full block, so only INIT4x4 at *_40 is needed.
zgemm_kernel_L4_M4_32: .Lzgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40 ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b zgemm_kernel_L4_M4_44 b .Lzgemm_kernel_L4_M4_44
zgemm_kernel_L4_M4_40: .Lzgemm_kernel_L4_M4_40:
INIT4x4 INIT4x4
// K remainder: origK & 7 single KERNEL4x4_SUB steps.
zgemm_kernel_L4_M4_44: .Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100 ble .Lzgemm_kernel_L4_M4_100
.align 5 .align 5
zgemm_kernel_L4_M4_46: .Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46 bne .Lzgemm_kernel_L4_M4_46
// Prefetch the next A data and the start of B before storing the tile.
zgemm_kernel_L4_M4_100: .Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
SAVE4x4 SAVE4x4
zgemm_kernel_L4_M4_END: .Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20 bne .Lzgemm_kernel_L4_M4_20
// ---- M tile of 2: leave the tier if origM & 3 == 0; skip to M1 if bit 1 is clear ----
zgemm_kernel_L4_M2_BEGIN: .Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN ble .Lzgemm_kernel_L4_M1_BEGIN
zgemm_kernel_L4_M2_20: .Lzgemm_kernel_L4_M2_20:
INIT2x4 INIT2x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40 ble .Lzgemm_kernel_L4_M2_40
zgemm_kernel_L4_M2_22: .Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22 bgt .Lzgemm_kernel_L4_M2_22
zgemm_kernel_L4_M2_40: .Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100 ble .Lzgemm_kernel_L4_M2_100
zgemm_kernel_L4_M2_42: .Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42 bgt .Lzgemm_kernel_L4_M2_42
zgemm_kernel_L4_M2_100: .Lzgemm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
zgemm_kernel_L4_M2_END: .Lzgemm_kernel_L4_M2_END:
// ---- M tile of 1: runs only when bit 0 of counterI (= origM) is set ----
zgemm_kernel_L4_M1_BEGIN: .Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END ble .Lzgemm_kernel_L4_END
zgemm_kernel_L4_M1_20: .Lzgemm_kernel_L4_M1_20:
INIT1x4 INIT1x4
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40 ble .Lzgemm_kernel_L4_M1_40
zgemm_kernel_L4_M1_22: .Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22 bgt .Lzgemm_kernel_L4_M1_22
zgemm_kernel_L4_M1_40: .Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100 ble .Lzgemm_kernel_L4_M1_100
zgemm_kernel_L4_M1_42: .Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42 bgt .Lzgemm_kernel_L4_M1_42
zgemm_kernel_L4_M1_100: .Lzgemm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
// End of one 4-column pass: step origPB past the panel just consumed and loop
// while column groups remain.
zgemm_kernel_L4_END: .Lzgemm_kernel_L4_END:
lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
// ---------------------------------------------------------------------------
// N tier "L2": handles a pair of columns from the origN & 3 remainder.
// Exits to L999 when origN & 3 == 0; skips to the L1 tier when bit 1 of origN
// is clear. Inside, M is walked in tiles of 4, 2 and 1; each tile runs
// origK >> 3 unrolled iterations followed by origK & 7 single KERNELmx2_SUB
// steps. On exit origPB advances by origK << 5 bytes.
// INIT*/KERNEL*/SAVE* are macros defined outside this view.
// NOTE(review): the label's inline comment says "less than 2 left in N
// direction", but this point is reached with up to 3 columns left (the code
// tests #3, then #2). Likewise "counterL = counterL / 8" and "% 8" describe
// origK >> 3 and origK & 7, and "tst counterI, #2" is a bit test, not a division.
// ---------------------------------------------------------------------------
zgemm_kernel_L2_BEGIN: // less than 2 left in N direction .Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999 ble .Lzgemm_kernel_L999
tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
// ---- M tiles of 4: counterI = origM >> 2, skipped when that is <= 0 ----
zgemm_kernel_L2_M4_BEGIN: .Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN ble .Lzgemm_kernel_L2_M2_BEGIN
zgemm_kernel_L2_M4_20: .Lzgemm_kernel_L2_M4_20:
INIT4x2 INIT4x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40 ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5
// Unrolled K loop; part of the KERNEL4x2_SUB run lies in elided diff context.
zgemm_kernel_L2_M4_22: .Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22 bgt .Lzgemm_kernel_L2_M4_22
// K remainder: origK & 7 single steps (branch taken when the remainder is 0).
zgemm_kernel_L2_M4_40: .Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100 ble .Lzgemm_kernel_L2_M4_100
zgemm_kernel_L2_M4_42: .Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42 bgt .Lzgemm_kernel_L2_M4_42
zgemm_kernel_L2_M4_100: .Lzgemm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
zgemm_kernel_L2_M4_END: .Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20 bgt .Lzgemm_kernel_L2_M4_20
// ---- M tile of 2: leave the tier if origM & 3 == 0; skip to M1 if bit 1 is clear ----
zgemm_kernel_L2_M2_BEGIN: .Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN ble .Lzgemm_kernel_L2_M1_BEGIN
zgemm_kernel_L2_M2_20: .Lzgemm_kernel_L2_M2_20:
INIT2x2 INIT2x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40 ble .Lzgemm_kernel_L2_M2_40
zgemm_kernel_L2_M2_22: .Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22 bgt .Lzgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_40: .Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100 ble .Lzgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_42: .Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42 bgt .Lzgemm_kernel_L2_M2_42
zgemm_kernel_L2_M2_100: .Lzgemm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
zgemm_kernel_L2_M2_END: .Lzgemm_kernel_L2_M2_END:
// ---- M tile of 1: runs only when bit 0 of counterI (= origM) is set ----
zgemm_kernel_L2_M1_BEGIN: .Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END ble .Lzgemm_kernel_L2_END
zgemm_kernel_L2_M1_20: .Lzgemm_kernel_L2_M1_20:
INIT1x2 INIT1x2
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40 ble .Lzgemm_kernel_L2_M1_40
zgemm_kernel_L2_M1_22: .Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22 bgt .Lzgemm_kernel_L2_M1_22
zgemm_kernel_L2_M1_40: .Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100 ble .Lzgemm_kernel_L2_M1_100
zgemm_kernel_L2_M1_42: .Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42 bgt .Lzgemm_kernel_L2_M1_42
zgemm_kernel_L2_M1_100: .Lzgemm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
// End of the 2-column pass: step origPB past the panel just consumed.
zgemm_kernel_L2_END: .Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/ /******************************************************************************/
// ---------------------------------------------------------------------------
// N tier "L1": handles one remaining column of N.
// Entered only when bit 0 of origN is set; otherwise control goes to L999.
// Inside, M is walked in tiles of 4 (M4), then 2 (M2), then 1 (M1).
// For every tile the K loop is split in two:
//   *_22 : origK >> 3 iterations of an unrolled run of KERNELmx1_SUB
//   *_42 : origK & 7 iterations of a single KERNELmx1_SUB
// INIT*/KERNEL*/SAVE* are macros defined outside this view; presumably they
// clear the accumulators, perform one K step, and write the tile to C.
// NOTE(review): the inline comments "counterL = counterL / 8" and
// "counterL = counterL % 8" really describe origK >> 3 and origK & 7, and
// "tst counterI, #2 // counterI = counterI / 2" is a bit test, not a division.
// ---------------------------------------------------------------------------
zgemm_kernel_L1_BEGIN: .Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:
// ---- M tiles of 4: counterI = origM >> 2, skipped when that is <= 0 ----
zgemm_kernel_L1_M4_BEGIN: .Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN ble .Lzgemm_kernel_L1_M2_BEGIN
zgemm_kernel_L1_M4_20: .Lzgemm_kernel_L1_M4_20:
INIT4x1 INIT4x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40 ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5
// Unrolled K loop; part of the KERNEL4x1_SUB run lies in elided diff context.
zgemm_kernel_L1_M4_22: .Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22 bgt .Lzgemm_kernel_L1_M4_22
// K remainder: origK & 7 single steps (branch taken when the remainder is 0).
zgemm_kernel_L1_M4_40: .Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100 ble .Lzgemm_kernel_L1_M4_100
zgemm_kernel_L1_M4_42: .Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42 bgt .Lzgemm_kernel_L1_M4_42
zgemm_kernel_L1_M4_100: .Lzgemm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
zgemm_kernel_L1_M4_END: .Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20 bgt .Lzgemm_kernel_L1_M4_20
// ---- M tile of 2: leave the tier if origM & 3 == 0; skip to M1 if bit 1 is clear ----
zgemm_kernel_L1_M2_BEGIN: .Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN ble .Lzgemm_kernel_L1_M1_BEGIN
zgemm_kernel_L1_M2_20: .Lzgemm_kernel_L1_M2_20:
INIT2x1 INIT2x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40 ble .Lzgemm_kernel_L1_M2_40
zgemm_kernel_L1_M2_22: .Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22 bgt .Lzgemm_kernel_L1_M2_22
zgemm_kernel_L1_M2_40: .Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100 ble .Lzgemm_kernel_L1_M2_100
zgemm_kernel_L1_M2_42: .Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42 bgt .Lzgemm_kernel_L1_M2_42
zgemm_kernel_L1_M2_100: .Lzgemm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
zgemm_kernel_L1_M2_END: .Lzgemm_kernel_L1_M2_END:
// ---- M tile of 1: runs only when bit 0 of counterI (= origM) is set ----
zgemm_kernel_L1_M1_BEGIN: .Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END ble .Lzgemm_kernel_L1_END
zgemm_kernel_L1_M1_20: .Lzgemm_kernel_L1_M1_20:
INIT1x1 INIT1x1
mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40 ble .Lzgemm_kernel_L1_M1_40
zgemm_kernel_L1_M1_22: .Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22 bgt .Lzgemm_kernel_L1_M1_22
zgemm_kernel_L1_M1_40: .Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100 ble .Lzgemm_kernel_L1_M1_100
zgemm_kernel_L1_M1_42: .Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42 bgt .Lzgemm_kernel_L1_M1_42
zgemm_kernel_L1_M1_100: .Lzgemm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
zgemm_kernel_L1_END: .Lzgemm_kernel_L1_END:
// Common exit: return value 0 in x0, then reload d8-d11 from the stack frame.
// The remainder of the epilogue is outside the visible hunk.
zgemm_kernel_L999: .Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]

View File

@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------------------
// zgemv_n kernel body.
// Early exit to L999 when N <= 0 or M <= 0. LDA and INC_X are scaled by
// 2^SHZ (SHZ is defined outside this view; presumably log2 of the element
// size in bytes).
// Two variants of the same outer loop follow, selected by INC_Y:
//   F path (INC_Y == 1): KERNEL_F4 for M >> 2 groups, KERNEL_F1 for M & 3 rest
//   S path (INC_Y != 1): unrolled KERNEL_S1 for M >> 2 groups, then M & 3 rest
// Each outer iteration advances A by LDA and decrements J; J is presumably
// initialised in the elided diff context. The KERNEL_* / INIT* macros are
// defined outside this view.
// ---------------------------------------------------------------------------
SAVE_REGS SAVE_REGS
cmp N, xzr cmp N, xzr
ble zgemv_n_kernel_L999 ble .Lzgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_n_kernel_L999 ble .Lzgemv_n_kernel_L999
lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT
cmp INC_Y, #1 cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN bne .Lzgemv_n_kernel_S_BEGIN
// ---- F path: outer loop head, reset the per-iteration A and Y cursors ----
zgemv_n_kernel_F_LOOP: .Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_n_kernel_F1 beq .Lzgemv_n_kernel_F1
zgemv_n_kernel_F4: .Lzgemv_n_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F4 bne .Lzgemv_n_kernel_F4
// Remainder: M & 3 elements, one KERNEL_F1 each (skipped when zero).
zgemv_n_kernel_F1: .Lzgemv_n_kernel_F1:
ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_F_END ble .Lzgemv_n_kernel_F_END
zgemv_n_kernel_F10: .Lzgemv_n_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F10 bne .Lzgemv_n_kernel_F10
zgemv_n_kernel_F_END: .Lzgemv_n_kernel_F_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_F_LOOP bne .Lzgemv_n_kernel_F_LOOP
b zgemv_n_kernel_L999 b .Lzgemv_n_kernel_L999
// ---- S path: same loop structure, using KERNEL_S1 throughout ----
zgemv_n_kernel_S_BEGIN: .Lzgemv_n_kernel_S_BEGIN:
INIT_S INIT_S
zgemv_n_kernel_S_LOOP: .Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_n_kernel_S1 ble .Lzgemv_n_kernel_S1
// Unrolled body; part of the KERNEL_S1 run lies in elided diff context.
zgemv_n_kernel_S4: .Lzgemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S4 bne .Lzgemv_n_kernel_S4
zgemv_n_kernel_S1: .Lzgemv_n_kernel_S1:
ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_S_END ble .Lzgemv_n_kernel_S_END
zgemv_n_kernel_S10: .Lzgemv_n_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S10 bne .Lzgemv_n_kernel_S10
zgemv_n_kernel_S_END: .Lzgemv_n_kernel_S_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_S_LOOP bne .Lzgemv_n_kernel_S_LOOP
// Common exit: restore saved registers and return 0 in w0.
zgemv_n_kernel_L999: .Lzgemv_n_kernel_L999:
RESTORE_REGS RESTORE_REGS
mov w0, wzr mov w0, wzr

View File

@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------------------
// zgemv_t kernel body.
// Early exit to L999 when N <= 0 or M <= 0. LDA and INC_Y are scaled by
// 2^SHZ (SHZ is defined outside this view; presumably log2 of the element
// size in bytes).
// Two variants of the same outer loop follow, selected by INC_X:
//   F path (INC_X == 1): KERNEL_F4 for M >> 2 groups followed by
//                        KERNEL_F4_FINALIZE, then KERNEL_F1 for M & 3 rest
//   S path (INC_X != 1): unrolled KERNEL_S1 for M >> 2 groups, then M & 3 rest
// After the inner loops each outer iteration loads from [Y] (the update itself
// is in elided diff context), advances A by LDA and decrements J; J is
// presumably initialised in elided context. The KERNEL_* / INIT* macros are
// defined outside this view.
// ---------------------------------------------------------------------------
SAVE_REGS SAVE_REGS
cmp N, xzr cmp N, xzr
ble zgemv_t_kernel_L999 ble .Lzgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_t_kernel_L999 ble .Lzgemv_t_kernel_L999
lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT
cmp INC_X, #1 cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN bne .Lzgemv_t_kernel_S_BEGIN
// ---- F path: outer loop head, reset the per-iteration A and X cursors ----
zgemv_t_kernel_F_LOOP: .Lzgemv_t_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_t_kernel_F1 beq .Lzgemv_t_kernel_F1
zgemv_t_kernel_F4: .Lzgemv_t_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F4 bne .Lzgemv_t_kernel_F4
// Runs only after at least one KERNEL_F4 pass: the zero-count case jumps past it to F1.
KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE
// Remainder: M & 3 elements, one KERNEL_F1 each (skipped when zero).
zgemv_t_kernel_F1: .Lzgemv_t_kernel_F1:
ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_F_END ble .Lzgemv_t_kernel_F_END
zgemv_t_kernel_F10: .Lzgemv_t_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F10 bne .Lzgemv_t_kernel_F10
zgemv_t_kernel_F_END: .Lzgemv_t_kernel_F_END:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_F_LOOP bne .Lzgemv_t_kernel_F_LOOP
b zgemv_t_kernel_L999 b .Lzgemv_t_kernel_L999
// ---- S path: same loop structure, using KERNEL_S1 throughout ----
zgemv_t_kernel_S_BEGIN: .Lzgemv_t_kernel_S_BEGIN:
INIT_S INIT_S
zgemv_t_kernel_S_LOOP: .Lzgemv_t_kernel_S_LOOP:
mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:
asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_t_kernel_S1 ble .Lzgemv_t_kernel_S1
// Unrolled body; part of the KERNEL_S1 run lies in elided diff context.
zgemv_t_kernel_S4: .Lzgemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S4 bne .Lzgemv_t_kernel_S4
zgemv_t_kernel_S1: .Lzgemv_t_kernel_S1:
ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_S_END ble .Lzgemv_t_kernel_S_END
zgemv_t_kernel_S10: .Lzgemv_t_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S10 bne .Lzgemv_t_kernel_S10
zgemv_t_kernel_S_END: .Lzgemv_t_kernel_S_END:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_S_LOOP bne .Lzgemv_t_kernel_S_LOOP
// Common exit: restore saved registers and return 0 in w0.
zgemv_t_kernel_L999: .Lzgemv_t_kernel_L999:
RESTORE_REGS RESTORE_REGS
mov w0, wzr mov w0, wzr
ret ret

View File

@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
// ---------------------------------------------------------------------------
// znrm2 kernel body.
// After INIT, control goes straight to L999 when N <= 0 or INC_X == 0.
//   F path (INC_X == 1): KERNEL_F8 for N >> 3 groups, KERNEL_F1 for N & 7 rest
//   S path (INC_X != 1): KERNEL_S1 once per iteration of I; I is presumably
//                        loaded in the elided diff context after INIT_S
// L999 computes SSQ = SCALE * sqrt(SSQ). The KERNEL_* / INIT* macros are
// defined outside this view.
// NOTE(review): the hunk header just above shows the macro label
// "KERNEL_S1_END_\@:" without a ".L" prefix, in context that precedes this
// file's first hunk. It looks like that label still produces a separate symbol
// per macro expansion - confirm against the full file.
// ---------------------------------------------------------------------------
INIT INIT
cmp N, #0 cmp N, #0
ble nrm2_kernel_L999 ble .Lznrm2_kernel_L999
cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999 beq .Lznrm2_kernel_L999
cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN bne .Lznrm2_kernel_S_BEGIN
// ---- F path: unit stride ----
nrm2_kernel_F_BEGIN: .Lznrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1 ble .Lznrm2_kernel_F1
nrm2_kernel_F8: .Lznrm2_kernel_F8:
KERNEL_F8 KERNEL_F8
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8 bne .Lznrm2_kernel_F8
// Remainder: N & 7 elements, one KERNEL_F1 each (skipped when zero).
nrm2_kernel_F1: .Lznrm2_kernel_F1:
ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999 ble .Lznrm2_kernel_L999
nrm2_kernel_F10: .Lznrm2_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10 bne .Lznrm2_kernel_F10
b nrm2_kernel_L999 b .Lznrm2_kernel_L999
// ---- S path: non-unit stride ----
nrm2_kernel_S_BEGIN: .Lznrm2_kernel_S_BEGIN:
INIT_S INIT_S
@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:
.align 5 .align 5
nrm2_kernel_S10: .Lznrm2_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10 bne .Lznrm2_kernel_S10
// Final combine: SSQ = SCALE * sqrt(SSQ).
nrm2_kernel_L999: .Lznrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ

View File

@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------------------
// zrot kernel body.
// Exits through L999 when N <= 0.
//   F path (INC_X == 1 and INC_Y == 1): KERNEL_INIT_F4 once, KERNEL_F4 for
//                                       N >> 2 groups, KERNEL_F1 for N & 3 rest
//   S path (either stride != 1):        unrolled KERNEL_S1 for N >> 2 groups,
//                                       then one KERNEL_S1 per N & 3 rest
// Every exit returns 0 in w0. The KERNEL_* / INIT* macros are defined outside
// this view.
// ---------------------------------------------------------------------------
PROLOGUE PROLOGUE
cmp N, xzr cmp N, xzr
ble rot_kernel_L999 ble .Lzrot_kernel_L999
INIT INIT
cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN bne .Lzrot_kernel_S_BEGIN
// ---- F path: both strides are 1 ----
rot_kernel_F_BEGIN: .Lzrot_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1 beq .Lzrot_kernel_F1
KERNEL_INIT_F4 KERNEL_INIT_F4
rot_kernel_F4: .Lzrot_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne rot_kernel_F4 bne .Lzrot_kernel_F4
// Remainder: N & 3 elements, one KERNEL_F1 each (skipped when zero).
rot_kernel_F1: .Lzrot_kernel_F1:
ands I, N, #3 ands I, N, #3
ble rot_kernel_L999 ble .Lzrot_kernel_L999
rot_kernel_F10: .Lzrot_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne rot_kernel_F10 bne .Lzrot_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
// ---- S path: at least one stride differs from 1 ----
rot_kernel_S_BEGIN: .Lzrot_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1 ble .Lzrot_kernel_S1
// Unrolled body; part of the KERNEL_S1 run lies in elided diff context.
rot_kernel_S4: .Lzrot_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne rot_kernel_S4 bne .Lzrot_kernel_S4
rot_kernel_S1: .Lzrot_kernel_S1:
ands I, N, #3 ands I, N, #3
ble rot_kernel_L999 ble .Lzrot_kernel_L999
rot_kernel_S10: .Lzrot_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne rot_kernel_S10 bne .Lzrot_kernel_S10
// Common exit: return 0 in w0.
rot_kernel_L999: .Lzrot_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret

View File

@ -215,71 +215,71 @@ zscal_begin:
// ---------------------------------------------------------------------------
// zscal kernel body: dispatches on whether the real part (DA_R) and imaginary
// part (DA_I) of the scalar compare equal to 0.0.
//   DA_R != 0, DA_I != 0 -> RI_non_zero : general path (F for INC_X == 1, else S)
//   DA_R == 0, DA_I != 0 -> R_zero      : per-element loop using v1 = (-DA_I, DA_I)
//   DA_R != 0, DA_I == 0 -> I_zero      : per-element loop using v0 = (DA_R, DA_R)
//   DA_R == 0, DA_I == 0 -> RI_zero     : stores the (DA_R, DA_I) pair to every element
// N <= 0 exits through L999. Every path returns 0 in w0.
// The KERNEL_* / INIT* macros are defined outside this view.
// NOTE(review): the hunk header just above shows "zscal_begin:" without a ".L"
// prefix, in context that precedes this file's first hunk. It looks like that
// label is still emitted as a separate symbol - confirm against the full file.
// ---------------------------------------------------------------------------
mov X_COPY, X mov X_COPY, X
cmp N, xzr cmp N, xzr
ble zscal_kernel_L999 ble .Lzscal_kernel_L999
fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero bne .Lzscal_kernel_R_non_zero
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_RI_zero beq .Lzscal_kernel_RI_zero
b zscal_kernel_R_zero b .Lzscal_kernel_R_zero
zscal_kernel_R_non_zero: .Lzscal_kernel_R_non_zero:
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_I_zero beq .Lzscal_kernel_I_zero
/******************************************************************************* /*******************************************************************************
* A_R != 0 && A_I != 0 * A_R != 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/
zscal_kernel_RI_non_zero: .Lzscal_kernel_RI_non_zero:
INIT INIT
cmp INC_X, #1 cmp INC_X, #1
bne zscal_kernel_S_BEGIN bne .Lzscal_kernel_S_BEGIN
// ---- F path (INC_X == 1): KERNEL_F4 for N >> 2 groups, KERNEL_F1 for N & 3 rest ----
zscal_kernel_F_BEGIN: .Lzscal_kernel_F_BEGIN:
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zscal_kernel_F1 beq .Lzscal_kernel_F1
KERNEL_INIT_F4 KERNEL_INIT_F4
zscal_kernel_F4: .Lzscal_kernel_F4:
KERNEL_F4 KERNEL_F4
subs I, I, #1 subs I, I, #1
bne zscal_kernel_F4 bne .Lzscal_kernel_F4
zscal_kernel_F1: .Lzscal_kernel_F1:
ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999 ble .Lzscal_kernel_L999
zscal_kernel_F10: .Lzscal_kernel_F10:
KERNEL_F1 KERNEL_F1
subs I, I, #1 subs I, I, #1
bne zscal_kernel_F10 bne .Lzscal_kernel_F10
mov w0, wzr mov w0, wzr
ret ret
// ---- S path (INC_X != 1): unrolled KERNEL_S1 for N >> 2 groups, then N & 3 rest ----
zscal_kernel_S_BEGIN: .Lzscal_kernel_S_BEGIN:
INIT_S INIT_S
asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zscal_kernel_S1 ble .Lzscal_kernel_S1
// Unrolled body; part of the KERNEL_S1 run lies in elided diff context.
zscal_kernel_S4: .Lzscal_kernel_S4:
KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zscal_kernel_S4 bne .Lzscal_kernel_S4
zscal_kernel_S1: .Lzscal_kernel_S1:
ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999 ble .Lzscal_kernel_L999
zscal_kernel_S10: .Lzscal_kernel_S10:
KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne zscal_kernel_S10 bne .Lzscal_kernel_S10
// Shared exit for the general path and the N <= 0 case.
zscal_kernel_L999: .Lzscal_kernel_L999:
mov w0, wzr mov w0, wzr
ret ret
@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0 * A_R == 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/
zscal_kernel_R_zero: .Lzscal_kernel_R_zero:
INIT_S INIT_S
#if !defined(DOUBLE) #if !defined(DOUBLE)
@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif #endif
// One complex element per iteration, N iterations, stepping X by INC_X.
zscal_kernel_R_zero_1: .Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_R_zero_1 bne .Lzscal_kernel_R_zero_1
mov w0, wzr mov w0, wzr
ret ret
@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0 * A_R != 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/
zscal_kernel_I_zero: .Lzscal_kernel_I_zero:
INIT_S INIT_S
#if !defined(DOUBLE) #if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif #endif
// One complex element per iteration, N iterations, stepping X by INC_X.
zscal_kernel_I_zero_1: .Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_I_zero_1 bne .Lzscal_kernel_I_zero_1
mov w0, wzr mov w0, wzr
ret ret
@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0 * A_R == 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/
zscal_kernel_RI_zero: .Lzscal_kernel_RI_zero:
INIT_S INIT_S
// Both scalar parts compared equal to 0.0, so this stores the scalar itself
// over every element rather than multiplying.
zscal_kernel_RI_zero_1: .Lzscal_kernel_RI_zero_1:
stp DA_R, DA_I, [X] stp DA_R, DA_I, [X]
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_RI_zero_1 bne .Lzscal_kernel_RI_zero_1
mov w0, wzr mov w0, wzr
ret ret

View File

@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN ble .Lztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN: .Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
ztrmm_kernel_L4_M4_BEGIN: .Lztrmm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN ble .Lztrmm_kernel_L4_M2_BEGIN
.align 5 .align 5
ztrmm_kernel_L4_M4_20: .Lztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:
asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ztrmm_kernel_L4_M4_32 blt .Lztrmm_kernel_L4_M4_32
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #2 subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a ble .Lztrmm_kernel_L4_M4_22a
.align 5 .align 5
ztrmm_kernel_L4_M4_22: .Lztrmm_kernel_L4_M4_22:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22 bgt .Lztrmm_kernel_L4_M4_22
.align 5 .align 5
ztrmm_kernel_L4_M4_22a: .Lztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ztrmm_kernel_L4_M4_44 b .Lztrmm_kernel_L4_M4_44
.align 5 .align 5
ztrmm_kernel_L4_M4_32: .Lztrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble ztrmm_kernel_L4_M4_40 ble .Lztrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ztrmm_kernel_L4_M4_44 b .Lztrmm_kernel_L4_M4_44
ztrmm_kernel_L4_M4_40: .Lztrmm_kernel_L4_M4_40:
INIT4x4 INIT4x4
ztrmm_kernel_L4_M4_44: .Lztrmm_kernel_L4_M4_44:
ands counterL , tempK, #7 ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100 ble .Lztrmm_kernel_L4_M4_100
.align 5 .align 5
ztrmm_kernel_L4_M4_46: .Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46 bne .Lztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100: .Lztrmm_kernel_L4_M4_100:
SAVE4x4 SAVE4x4
@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END: .Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20 bne .Lztrmm_kernel_L4_M4_20
ztrmm_kernel_L4_M2_BEGIN: .Lztrmm_kernel_L4_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L4_END ble .Lztrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L4_M1_BEGIN ble .Lztrmm_kernel_L4_M1_BEGIN
ztrmm_kernel_L4_M2_20: .Lztrmm_kernel_L4_M2_20:
INIT2x4 INIT2x4
@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M2_40 ble .Lztrmm_kernel_L4_M2_40
ztrmm_kernel_L4_M2_22: .Lztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22 bgt .Lztrmm_kernel_L4_M2_22
ztrmm_kernel_L4_M2_40: .Lztrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100 ble .Lztrmm_kernel_L4_M2_100
ztrmm_kernel_L4_M2_42: .Lztrmm_kernel_L4_M2_42:
KERNEL2x4_SUB KERNEL2x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42 bgt .Lztrmm_kernel_L4_M2_42
ztrmm_kernel_L4_M2_100: .Lztrmm_kernel_L4_M2_100:
SAVE2x4 SAVE2x4
@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ztrmm_kernel_L4_M2_END: .Lztrmm_kernel_L4_M2_END:
ztrmm_kernel_L4_M1_BEGIN: .Lztrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L4_END ble .Lztrmm_kernel_L4_END
ztrmm_kernel_L4_M1_20: .Lztrmm_kernel_L4_M1_20:
INIT1x4 INIT1x4
@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M1_40 ble .Lztrmm_kernel_L4_M1_40
ztrmm_kernel_L4_M1_22: .Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22 bgt .Lztrmm_kernel_L4_M1_22
ztrmm_kernel_L4_M1_40: .Lztrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M1_100 ble .Lztrmm_kernel_L4_M1_100
ztrmm_kernel_L4_M1_42: .Lztrmm_kernel_L4_M1_42:
KERNEL1x4_SUB KERNEL1x4_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42 bgt .Lztrmm_kernel_L4_M1_42
ztrmm_kernel_L4_M1_100: .Lztrmm_kernel_L4_M1_100:
SAVE1x4 SAVE1x4
@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif #endif
ztrmm_kernel_L4_END: .Lztrmm_kernel_L4_END:
lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif #endif
subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ztrmm_kernel_L4_BEGIN bgt .Lztrmm_kernel_L4_BEGIN
/******************************************************************************/ /******************************************************************************/
ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction .Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ztrmm_kernel_L999 // error, N was less than 4? ble .Lztrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2 tst counterJ , #2
ble ztrmm_kernel_L1_BEGIN ble .Lztrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC
@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A
ztrmm_kernel_L2_M4_BEGIN: .Lztrmm_kernel_L2_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ztrmm_kernel_L2_M2_BEGIN ble .Lztrmm_kernel_L2_M2_BEGIN
ztrmm_kernel_L2_M4_20: .Lztrmm_kernel_L2_M4_20:
INIT4x2 INIT4x2
@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M4_40 ble .Lztrmm_kernel_L2_M4_40
.align 5 .align 5
ztrmm_kernel_L2_M4_22: .Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22 bgt .Lztrmm_kernel_L2_M4_22
ztrmm_kernel_L2_M4_40: .Lztrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M4_100 ble .Lztrmm_kernel_L2_M4_100
ztrmm_kernel_L2_M4_42: .Lztrmm_kernel_L2_M4_42:
KERNEL4x2_SUB KERNEL4x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42 bgt .Lztrmm_kernel_L2_M4_42
ztrmm_kernel_L2_M4_100: .Lztrmm_kernel_L2_M4_100:
SAVE4x2 SAVE4x2
@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ztrmm_kernel_L2_M4_END: .Lztrmm_kernel_L2_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20 bgt .Lztrmm_kernel_L2_M4_20
ztrmm_kernel_L2_M2_BEGIN: .Lztrmm_kernel_L2_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L2_END ble .Lztrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L2_M1_BEGIN ble .Lztrmm_kernel_L2_M1_BEGIN
ztrmm_kernel_L2_M2_20: .Lztrmm_kernel_L2_M2_20:
INIT2x2 INIT2x2
@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M2_40 ble .Lztrmm_kernel_L2_M2_40
ztrmm_kernel_L2_M2_22: .Lztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22 bgt .Lztrmm_kernel_L2_M2_22
ztrmm_kernel_L2_M2_40: .Lztrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M2_100 ble .Lztrmm_kernel_L2_M2_100
ztrmm_kernel_L2_M2_42: .Lztrmm_kernel_L2_M2_42:
KERNEL2x2_SUB KERNEL2x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42 bgt .Lztrmm_kernel_L2_M2_42
ztrmm_kernel_L2_M2_100: .Lztrmm_kernel_L2_M2_100:
SAVE2x2 SAVE2x2
@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ztrmm_kernel_L2_M2_END: .Lztrmm_kernel_L2_M2_END:
ztrmm_kernel_L2_M1_BEGIN: .Lztrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L2_END ble .Lztrmm_kernel_L2_END
ztrmm_kernel_L2_M1_20: .Lztrmm_kernel_L2_M1_20:
INIT1x2 INIT1x2
@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ztrmm_kernel_L2_M1_40 ble .Lztrmm_kernel_L2_M1_40
ztrmm_kernel_L2_M1_22: .Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22 bgt .Lztrmm_kernel_L2_M1_22
ztrmm_kernel_L2_M1_40: .Lztrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M1_100 ble .Lztrmm_kernel_L2_M1_100
ztrmm_kernel_L2_M1_42: .Lztrmm_kernel_L2_M1_42:
KERNEL1x2_SUB KERNEL1x2_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42 bgt .Lztrmm_kernel_L2_M1_42
ztrmm_kernel_L2_M1_100: .Lztrmm_kernel_L2_M1_100:
SAVE1x2 SAVE1x2
@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif #endif
ztrmm_kernel_L2_END: .Lztrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:
/******************************************************************************/ /******************************************************************************/
ztrmm_kernel_L1_BEGIN: .Lztrmm_kernel_L1_BEGIN:
mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ztrmm_kernel_L999 // done ble .Lztrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:
ztrmm_kernel_L1_M4_BEGIN: .Lztrmm_kernel_L1_M4_BEGIN:
mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L1_M2_BEGIN ble .Lztrmm_kernel_L1_M2_BEGIN
ztrmm_kernel_L1_M4_20: .Lztrmm_kernel_L1_M4_20:
INIT4x1 INIT4x1
@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M4_40 ble .Lztrmm_kernel_L1_M4_40
.align 5 .align 5
ztrmm_kernel_L1_M4_22: .Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22 bgt .Lztrmm_kernel_L1_M4_22
ztrmm_kernel_L1_M4_40: .Lztrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M4_100 ble .Lztrmm_kernel_L1_M4_100
ztrmm_kernel_L1_M4_42: .Lztrmm_kernel_L1_M4_42:
KERNEL4x1_SUB KERNEL4x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42 bgt .Lztrmm_kernel_L1_M4_42
ztrmm_kernel_L1_M4_100: .Lztrmm_kernel_L1_M4_100:
SAVE4x1 SAVE4x1
@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif
ztrmm_kernel_L1_M4_END: .Lztrmm_kernel_L1_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20 bgt .Lztrmm_kernel_L1_M4_20
ztrmm_kernel_L1_M2_BEGIN: .Lztrmm_kernel_L1_M2_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L1_END ble .Lztrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L1_M1_BEGIN ble .Lztrmm_kernel_L1_M1_BEGIN
ztrmm_kernel_L1_M2_20: .Lztrmm_kernel_L1_M2_20:
INIT2x1 INIT2x1
@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M2_40 ble .Lztrmm_kernel_L1_M2_40
ztrmm_kernel_L1_M2_22: .Lztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22 bgt .Lztrmm_kernel_L1_M2_22
ztrmm_kernel_L1_M2_40: .Lztrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M2_100 ble .Lztrmm_kernel_L1_M2_100
ztrmm_kernel_L1_M2_42: .Lztrmm_kernel_L1_M2_42:
KERNEL2x1_SUB KERNEL2x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42 bgt .Lztrmm_kernel_L1_M2_42
ztrmm_kernel_L1_M2_100: .Lztrmm_kernel_L1_M2_100:
SAVE2x1 SAVE2x1
@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
ztrmm_kernel_L1_M2_END: .Lztrmm_kernel_L1_M2_END:
ztrmm_kernel_L1_M1_BEGIN: .Lztrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L1_END ble .Lztrmm_kernel_L1_END
ztrmm_kernel_L1_M1_20: .Lztrmm_kernel_L1_M1_20:
INIT1x1 INIT1x1
@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M1_40 ble .Lztrmm_kernel_L1_M1_40
ztrmm_kernel_L1_M1_22: .Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22 bgt .Lztrmm_kernel_L1_M1_22
ztrmm_kernel_L1_M1_40: .Lztrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M1_100 ble .Lztrmm_kernel_L1_M1_100
ztrmm_kernel_L1_M1_42: .Lztrmm_kernel_L1_M1_42:
KERNEL1x1_SUB KERNEL1x1_SUB
subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42 bgt .Lztrmm_kernel_L1_M1_42
ztrmm_kernel_L1_M1_100: .Lztrmm_kernel_L1_M1_100:
SAVE1x1 SAVE1x1
ztrmm_kernel_L1_END: .Lztrmm_kernel_L1_END:
ztrmm_kernel_L999: .Lztrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]