ARM64: Convert all labels to local labels

While debugging/profiling applications using perf or other tools, the
kernels appear scattered in the profile reports. This is because the labels
within the kernels are not local and each label is shown as a separate
function.

To avoid this, all the labels within the kernels are changed to local
labels.
This commit is contained in:
Ashwin Sekhar T K
2017-10-24 10:47:11 +00:00
parent 627133f9ad
commit a0128aa489
50 changed files with 4469 additions and 4469 deletions

View File

@@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
@@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
@@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
@@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
@@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
@@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
@@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
@@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
@@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A
dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:
INIT8x2
@@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40
.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22
dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42
dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:
SAVE8x2
dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
@@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
@@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
@@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
@@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40
.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22
dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42
dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:
SAVE8x1
dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
@@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
@@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
@@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]