ARM64: Convert all labels to local labels

When debugging or profiling applications with perf or similar tools, the
kernels appear scattered across the profile reports. This happens because
the labels within the kernels are not local, so each label is shown as a
separate function.

To avoid this, all the labels within the kernels are changed to local
labels by giving them the `.L` prefix.
This commit is contained in:
Ashwin Sekhar T K
2017-10-24 10:47:11 +00:00
parent 627133f9ad
commit a0128aa489
50 changed files with 4469 additions and 4469 deletions

View File

@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
//------------------------------------------------------------------------------
dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #2 // L = K / 4
cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a
.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22
.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44
dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:
INIT8x4
dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100
.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46
dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
SAVE8x4
dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20
dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN
dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN
dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22
dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42
dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:
SAVE2x4
dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:
dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END
dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22
dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42
dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:
SAVE1x4
dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:
dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]