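/*
 * Listing metadata: 756 lines, 8.6 KiB.
 * The viewer labelled this listing "ArmAsm", but the mnemonics used below
 * (srawi., addic., mtctr, bdnz, dcbt) are PowerPC.
 */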
/*
 * Head of the four-wide loop along N (label prefix DSTRM_LT_L4).
 *
 * J = N >> 2 (arithmetic shift) is the trip count; when it is <= 0 control
 * goes straight to DSTRM_LT_L4_END.  Each pass re-seeds CO, AO and KK and
 * moves C forward by 4 * LDC.
 *
 * J, N, C, A, CO, AO, LDC, KK, OFFSET, I, M and T1 are symbols defined
 * outside this listing (presumably register aliases -- confirm in the
 * including file).
 */
        srawi.  J, N, 2                 /* J = N >> 2, CR0 reflects the result */
        ble     DSTRM_LT_L4_END         /* J <= 0: no four-wide pass at all */

DSTRM_LT_L4_BEGIN:

        mr      CO, C                   /* CO = C for this pass */
        mr      AO, A                   /* AO restarts at A on every pass */
        slwi    T1, LDC, 2              /* T1 = LDC << 2; slwi is a 32-bit shift */
        add     C, C, T1                /* C += 4 * LDC, ready for the next pass */

        mr      KK, OFFSET              /* KK restarts from OFFSET on every pass */
        srawi.  I, M, 4                 /* I = M >> 4: count of 16-wide tiles along M */
        ble     DSTRM_LT_L4x16_END      /* I <= 0: go straight to the remainder tiles */

/*
 * 16x4 tile: executed I = M >> 4 times per pass of the four-wide N loop.
 *
 * Sequence per tile: prefetch hints for four C locations, INIT_16x4, then
 * KK invocations of KERNEL_16x4 (counted in CTR, unrolled four times), then
 * SOLVE_LT_16x4 and the pointer/offset bookkeeping.
 * INIT_16x4, KERNEL_16x4 and SOLVE_LT_16x4 are macros defined outside this
 * listing; what they do to AO/BO/CO is not visible here.
 */
DSTRM_LT_L4x16_BEGIN:

        mr      BO, B                   /* BO restarts at B for every tile */

        li      L, -128                 /* mask with the low seven bits clear */

        mr      T1, CO                  /* T1..T4 = CO + {0,1,2,3} * LDC */
        add     T2, T1, LDC
        add     T3, T2, LDC
        add     T4, T3, LDC

        and     T1, T1, L               /* round each address down to a multiple of 128 */
        and     T2, T2, L
        and     T3, T3, L
        and     T4, T4, L

        /*
         * NOTE(review): in the X-form "dcbt RA,RB" the RB operand is read as a
         * register, so each hint below targets Tn + (contents of r0), assuming
         * r0 is the plain GPR name and not a macro.  dcbt is only a hint, so a
         * non-zero r0 would affect prefetch usefulness rather than results --
         * confirm r0 is expected to hold 0 here.
         */
        dcbt    T1, r0
        dcbt    T2, r0
        dcbt    T3, r0
        dcbt    T4, r0

        addi    T1, T1, 128             /* step to the following 128-byte block */
        addi    T2, T2, 128
        addi    T3, T3, 128
        addi    T4, T4, 128

        dcbt    T1, r0
        dcbt    T2, r0
        dcbt    T3, r0
        dcbt    T4, r0

DSTRM_LT_L4x16_LOOP_START:

        INIT_16x4

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble-    DSTRM_LT_L4x16_SAVE     /* KK <= 0: no KERNEL step (hinted not-taken) */
        mtctr   L                       /* CTR = KK kernel invocations */

DSTRM_LT_L4x16_LOOP:

        /* Four copies of the step; every bdz-/bdnz+ decrements CTR once. */
        dcbt    AO, PRE                 /* prefetch hint at AO + PRE */
        dcbt    BO, PRE                 /* prefetch hint at BO + PRE (first copy only) */
        KERNEL_16x4
        bdz-    DSTRM_LT_L4x16_SAVE     /* leave as soon as CTR reaches zero */

        dcbt    AO, PRE
        KERNEL_16x4
        bdz-    DSTRM_LT_L4x16_SAVE

        dcbt    AO, PRE
        KERNEL_16x4
        bdz-    DSTRM_LT_L4x16_SAVE

        dcbt    AO, PRE
        KERNEL_16x4
        bdnz+   DSTRM_LT_L4x16_LOOP     /* CTR != 0: another round (hinted taken) */

DSTRM_LT_L4x16_SAVE:

        SOLVE_LT_16x4

        addi    CO, CO, 16*SIZE         /* CO moves on by 16 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 4+BASE_SHIFT    /* T3 = (K - KK) << (4 + BASE_SHIFT) */
        slwi    T4, T4, 2+BASE_SHIFT    /* T4 = (K - KK) << (2 + BASE_SHIFT) */
        add     AO, AO, T3              /* AO += T3 */
        add     BO, BO, T4              /* BO += T4 */
        addi    KK, KK, 16              /* KK grows by the tile width */

        addic.  I, I, -1                /* one 16-wide tile done */
        bgt     DSTRM_LT_L4x16_BEGIN

DSTRM_LT_L4x16_END:

/*
 * 8x4 remainder tile: runs at most once per pass, when bit 3 of M is set.
 * The first test also short-circuits every remainder tile of this pass when
 * M & 15 == 0.  andi. never yields a negative value, so each "ble" below is
 * taken exactly when the masked value is zero.
 * INIT_8x4, KERNEL_8x4 and SOLVE_LT_8x4 are macros defined outside this
 * listing.
 */
DSTRM_LT_L4x8_BEGIN:

        andi.   T2, M, 15               /* anything left after the 16-wide tiles? */
        ble     DSTRM_LT_L4x1_END       /* M & 15 == 0: skip the 8/4/2/1 tiles */

        andi.   T1, M, 8
        ble     DSTRM_LT_L4x8_END       /* bit 3 clear: no 8-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L4x8_LOOP_START:

        INIT_8x4

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L4x8_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L4x8_LOOP:

        KERNEL_8x4

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L4x8_LOOP

DSTRM_LT_L4x8_SAVE:

        SOLVE_LT_8x4

        addi    CO, CO, 8*SIZE          /* CO moves on by 8 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 3+BASE_SHIFT    /* T3 = (K - KK) << (3 + BASE_SHIFT) */
        slwi    T4, T4, 2+BASE_SHIFT    /* T4 = (K - KK) << (2 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 8               /* KK grows by the tile width */

DSTRM_LT_L4x8_END:

/*
 * 4x4 remainder tile: runs at most once per pass, when bit 2 of M is set.
 * INIT_4x4, KERNEL_4x4 and SOLVE_LT_4x4 are macros defined outside this
 * listing.
 */
DSTRM_LT_L4x4_BEGIN:

        andi.   T1, M, 4
        ble     DSTRM_LT_L4x4_END       /* masked value is zero: no 4-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L4x4_LOOP_START:

        INIT_4x4

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L4x4_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L4x4_LOOP:

        KERNEL_4x4

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L4x4_LOOP

DSTRM_LT_L4x4_SAVE:

        SOLVE_LT_4x4

        addi    CO, CO, 4*SIZE          /* CO moves on by 4 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 2+BASE_SHIFT    /* both scaled by << (2 + BASE_SHIFT) */
        slwi    T4, T4, 2+BASE_SHIFT
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 4               /* KK grows by the tile width */

DSTRM_LT_L4x4_END:

/*
 * 2x4 remainder tile: runs at most once per pass, when bit 1 of M is set.
 * INIT_2x4, KERNEL_2x4 and SOLVE_LT_2x4 are macros defined outside this
 * listing.
 */
DSTRM_LT_L4x2_BEGIN:

        andi.   T1, M, 2
        ble     DSTRM_LT_L4x2_END       /* masked value is zero: no 2-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L4x2_LOOP_START:

        INIT_2x4

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L4x2_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L4x2_LOOP:

        KERNEL_2x4

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L4x2_LOOP

DSTRM_LT_L4x2_SAVE:

        SOLVE_LT_2x4

        addi    CO, CO, 2*SIZE          /* CO moves on by 2 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 1+BASE_SHIFT    /* T3 = (K - KK) << (1 + BASE_SHIFT) */
        slwi    T4, T4, 2+BASE_SHIFT    /* T4 = (K - KK) << (2 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 2               /* KK grows by the tile width */

DSTRM_LT_L4x2_END:

/*
 * 1x4 remainder tile: runs at most once per pass, when bit 0 of M is set.
 * DSTRM_LT_L4x1_END is also the landing point used when M & 15 == 0.
 * INIT_1x4, KERNEL_1x4 and SOLVE_LT_1x4 are macros defined outside this
 * listing.
 */
DSTRM_LT_L4x1_BEGIN:

        andi.   T1, M, 1
        ble     DSTRM_LT_L4x1_END       /* masked value is zero: no 1-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L4x1_LOOP_START:

        INIT_1x4

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L4x1_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L4x1_LOOP:

        KERNEL_1x4

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L4x1_LOOP

DSTRM_LT_L4x1_SAVE:

        SOLVE_LT_1x4

        addi    CO, CO, 1*SIZE          /* CO moves on by 1 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 0+BASE_SHIFT    /* T3 = (K - KK) << BASE_SHIFT */
        slwi    T4, T4, 2+BASE_SHIFT    /* T4 = (K - KK) << (2 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 1               /* KK grows by the tile width */

DSTRM_LT_L4x1_END:

/*
 * Tail of one four-wide pass along N: advance B, loop while J > 0, then
 * decide whether a two-wide / one-wide remainder exists.
 *
 * DSTRM_LT_L4_END is reached by the initial "J <= 0" branch and by falling
 * through here when N & 3 != 0; both continue at DSTRM_LT_L2_BEGIN.
 * L999 is defined outside this listing.
 */
        slwi    T1, K, 2+BASE_SHIFT     /* T1 = K << (2 + BASE_SHIFT) */
        add     B, B, T1                /* B += T1 */

        addic.  J, J, -1                /* one four-wide pass done */
        bgt     DSTRM_LT_L4_BEGIN

        andi.   T2, N, 3                /* remainder of N after the four-wide passes */
        ble     L999                    /* N & 3 == 0: nothing left, leave via L999 */

DSTRM_LT_L4_END:

        b       DSTRM_LT_L2_BEGIN

L999_H1:                                /* no branch in this listing targets this label */

        b       L999

/*
 * Head of the two-wide section along N: runs once when bit 1 of N is set.
 * Re-seeds CO, AO and KK and moves C forward by 2 * LDC.
 */
DSTRM_LT_L2_BEGIN:

        andi.   T1, N, 2
        ble     DSTRM_LT_L2_END         /* bit 1 of N clear: skip the whole section */

        mr      CO, C                   /* CO = C for this section */
        mr      AO, A                   /* AO restarts at A */
        slwi    T1, LDC, 1              /* T1 = LDC << 1; slwi is a 32-bit shift */
        add     C, C, T1                /* C += 2 * LDC */

        mr      KK, OFFSET              /* KK restarts from OFFSET */
        srawi.  I, M, 4                 /* I = M >> 4: count of 16-wide tiles along M */
        ble     DSTRM_LT_L2x16_END      /* I <= 0: go straight to the remainder tiles */

/*
 * 16x2 tile: executed I = M >> 4 times.  Unlike the 16x4 tile in this
 * listing, the step loop here is a plain count-down on L with no dcbt hints
 * and no use of CTR.
 * INIT_16x2, KERNEL_16x2 and SOLVE_LT_16x2 are macros defined outside this
 * listing.
 */
DSTRM_LT_L2x16_BEGIN:

        mr      BO, B                   /* BO restarts at B for every tile */

DSTRM_LT_L2x16_LOOP_START:

        INIT_16x2

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L2x16_SAVE     /* KK <= 0: no KERNEL step */

DSTRM_LT_L2x16_LOOP:

        KERNEL_16x2

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L2x16_LOOP

DSTRM_LT_L2x16_SAVE:

        SOLVE_LT_16x2

        addi    CO, CO, 16*SIZE         /* CO moves on by 16 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 4+BASE_SHIFT    /* T3 = (K - KK) << (4 + BASE_SHIFT) */
        slwi    T4, T4, 1+BASE_SHIFT    /* T4 = (K - KK) << (1 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 16              /* KK grows by the tile width */

        addic.  I, I, -1                /* one 16-wide tile done */
        bgt     DSTRM_LT_L2x16_BEGIN

DSTRM_LT_L2x16_END:

/*
 * 8x2 remainder tile: runs at most once, when bit 3 of M is set.
 * The first test skips every remainder tile of the two-wide section when
 * M & 15 == 0.  andi. never yields a negative value, so each "ble" below is
 * taken exactly when the masked value is zero.
 * INIT_8x2, KERNEL_8x2 and SOLVE_LT_8x2 are macros defined outside this
 * listing.
 */
DSTRM_LT_L2x8_BEGIN:

        andi.   T2, M, 15               /* anything left after the 16-wide tiles? */
        ble     DSTRM_LT_L2x1_END       /* M & 15 == 0: skip the 8/4/2/1 tiles */

        andi.   T1, M, 8
        ble     DSTRM_LT_L2x8_END       /* bit 3 clear: no 8-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L2x8_LOOP_START:

        INIT_8x2

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L2x8_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L2x8_LOOP:

        KERNEL_8x2

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L2x8_LOOP

DSTRM_LT_L2x8_SAVE:

        SOLVE_LT_8x2

        addi    CO, CO, 8*SIZE          /* CO moves on by 8 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 3+BASE_SHIFT    /* T3 = (K - KK) << (3 + BASE_SHIFT) */
        slwi    T4, T4, 1+BASE_SHIFT    /* T4 = (K - KK) << (1 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 8               /* KK grows by the tile width */

DSTRM_LT_L2x8_END:

/*
 * 4x2 remainder tile: runs at most once, when bit 2 of M is set.
 * INIT_4x2, KERNEL_4x2 and SOLVE_LT_4x2 are macros defined outside this
 * listing.
 */
DSTRM_LT_L2x4_BEGIN:

        andi.   T1, M, 4
        ble     DSTRM_LT_L2x4_END       /* masked value is zero: no 4-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L2x4_LOOP_START:

        INIT_4x2

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L2x4_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L2x4_LOOP:

        KERNEL_4x2

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L2x4_LOOP

DSTRM_LT_L2x4_SAVE:

        SOLVE_LT_4x2

        addi    CO, CO, 4*SIZE          /* CO moves on by 4 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 2+BASE_SHIFT    /* T3 = (K - KK) << (2 + BASE_SHIFT) */
        slwi    T4, T4, 1+BASE_SHIFT    /* T4 = (K - KK) << (1 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 4               /* KK grows by the tile width */

DSTRM_LT_L2x4_END:

/*
 * 2x2 remainder tile: runs at most once, when bit 1 of M is set.
 * INIT_2x2, KERNEL_2x2 and SOLVE_LT_2x2 are macros defined outside this
 * listing.
 */
DSTRM_LT_L2x2_BEGIN:

        andi.   T1, M, 2
        ble     DSTRM_LT_L2x2_END       /* masked value is zero: no 2-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L2x2_LOOP_START:

        INIT_2x2

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L2x2_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L2x2_LOOP:

        KERNEL_2x2

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L2x2_LOOP

DSTRM_LT_L2x2_SAVE:

        SOLVE_LT_2x2

        addi    CO, CO, 2*SIZE          /* CO moves on by 2 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 1+BASE_SHIFT    /* both scaled by << (1 + BASE_SHIFT) */
        slwi    T4, T4, 1+BASE_SHIFT
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 2               /* KK grows by the tile width */

DSTRM_LT_L2x2_END:

/*
 * 1x2 remainder tile: runs at most once, when bit 0 of M is set.
 * DSTRM_LT_L2x1_END is also the landing point used when M & 15 == 0.
 * INIT_1x2, KERNEL_1x2 and SOLVE_LT_1x2 are macros defined outside this
 * listing.
 */
DSTRM_LT_L2x1_BEGIN:

        andi.   T1, M, 1
        ble     DSTRM_LT_L2x1_END       /* masked value is zero: no 1-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L2x1_LOOP_START:

        INIT_1x2

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L2x1_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L2x1_LOOP:

        KERNEL_1x2

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L2x1_LOOP

DSTRM_LT_L2x1_SAVE:

        SOLVE_LT_1x2

        addi    CO, CO, 1*SIZE          /* CO moves on by 1 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 0+BASE_SHIFT    /* T3 = (K - KK) << BASE_SHIFT */
        slwi    T4, T4, 1+BASE_SHIFT    /* T4 = (K - KK) << (1 + BASE_SHIFT) */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 1               /* KK grows by the tile width */

DSTRM_LT_L2x1_END:

/*
 * Tail of the two-wide section: advance B by K << (1 + BASE_SHIFT).
 * DSTRM_LT_L2_END is the target used when bit 1 of N is clear, in which
 * case B is left untouched.
 */
        slwi    T1, K, 1+BASE_SHIFT     /* T1 = K << (1 + BASE_SHIFT) */
        add     B, B, T1                /* B += T1 */

DSTRM_LT_L2_END:

/*
 * Head of the one-wide section along N: runs once when bit 0 of N is set.
 * Re-seeds CO, AO and KK.  C is not advanced in this section.
 */
DSTRM_LT_L1_BEGIN:

        andi.   T1, N, 1
        ble     DSTRM_LT_L1_END         /* bit 0 of N clear: skip the whole section */

        mr      CO, C                   /* CO = C for this section */
        mr      AO, A                   /* AO restarts at A */

        mr      KK, OFFSET              /* KK restarts from OFFSET */
        srawi.  I, M, 4                 /* I = M >> 4: count of 16-wide tiles along M */
        ble     DSTRM_LT_L1x16_END      /* I <= 0: go straight to the remainder tiles */

/*
 * 16x1 tile: executed I = M >> 4 times, with a plain count-down on L for
 * the KERNEL steps.
 * INIT_16x1, KERNEL_16x1 and SOLVE_LT_16x1 are macros defined outside this
 * listing.
 */
DSTRM_LT_L1x16_BEGIN:

        mr      BO, B                   /* BO restarts at B for every tile */

DSTRM_LT_L1x16_LOOP_START:

        INIT_16x1

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L1x16_SAVE     /* KK <= 0: no KERNEL step */

DSTRM_LT_L1x16_LOOP:

        KERNEL_16x1

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L1x16_LOOP

DSTRM_LT_L1x16_SAVE:

        SOLVE_LT_16x1

        addi    CO, CO, 16*SIZE         /* CO moves on by 16 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 4+BASE_SHIFT    /* T3 = (K - KK) << (4 + BASE_SHIFT) */
        slwi    T4, T4, 0+BASE_SHIFT    /* T4 = (K - KK) << BASE_SHIFT */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 16              /* KK grows by the tile width */

        addic.  I, I, -1                /* one 16-wide tile done */
        bgt     DSTRM_LT_L1x16_BEGIN

DSTRM_LT_L1x16_END:

/*
 * 8x1 remainder tile: runs at most once, when bit 3 of M is set.
 * There is no combined "M & 15" early-out in this section; each remainder
 * width is tested on its own.
 * INIT_8x1, KERNEL_8x1 and SOLVE_LT_8x1 are macros defined outside this
 * listing.
 */
DSTRM_LT_L1x8_BEGIN:

        andi.   T1, M, 8
        ble     DSTRM_LT_L1x8_END       /* masked value is zero: no 8-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L1x8_LOOP_START:

        INIT_8x1

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L1x8_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L1x8_LOOP:

        KERNEL_8x1

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L1x8_LOOP

DSTRM_LT_L1x8_SAVE:

        SOLVE_LT_8x1

        addi    CO, CO, 8*SIZE          /* CO moves on by 8 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 3+BASE_SHIFT    /* T3 = (K - KK) << (3 + BASE_SHIFT) */
        slwi    T4, T4, 0+BASE_SHIFT    /* T4 = (K - KK) << BASE_SHIFT */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 8               /* KK grows by the tile width */

DSTRM_LT_L1x8_END:

/*
 * 4x1 remainder tile: runs at most once, when bit 2 of M is set.
 * INIT_4x1, KERNEL_4x1 and SOLVE_LT_4x1 are macros defined outside this
 * listing.
 */
DSTRM_LT_L1x4_BEGIN:

        andi.   T1, M, 4
        ble     DSTRM_LT_L1x4_END       /* masked value is zero: no 4-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L1x4_LOOP_START:

        INIT_4x1

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L1x4_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L1x4_LOOP:

        KERNEL_4x1

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L1x4_LOOP

DSTRM_LT_L1x4_SAVE:

        SOLVE_LT_4x1

        addi    CO, CO, 4*SIZE          /* CO moves on by 4 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 2+BASE_SHIFT    /* T3 = (K - KK) << (2 + BASE_SHIFT) */
        slwi    T4, T4, 0+BASE_SHIFT    /* T4 = (K - KK) << BASE_SHIFT */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 4               /* KK grows by the tile width */

DSTRM_LT_L1x4_END:

/*
 * 2x1 remainder tile: runs at most once, when bit 1 of M is set.
 * INIT_2x1, KERNEL_2x1 and SOLVE_LT_2x1 are macros defined outside this
 * listing.
 */
DSTRM_LT_L1x2_BEGIN:

        andi.   T1, M, 2
        ble     DSTRM_LT_L1x2_END       /* masked value is zero: no 2-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L1x2_LOOP_START:

        INIT_2x1

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L1x2_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L1x2_LOOP:

        KERNEL_2x1

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L1x2_LOOP

DSTRM_LT_L1x2_SAVE:

        SOLVE_LT_2x1

        addi    CO, CO, 2*SIZE          /* CO moves on by 2 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 1+BASE_SHIFT    /* T3 = (K - KK) << (1 + BASE_SHIFT) */
        slwi    T4, T4, 0+BASE_SHIFT    /* T4 = (K - KK) << BASE_SHIFT */
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 2               /* KK grows by the tile width */

DSTRM_LT_L1x2_END:

/*
 * 1x1 remainder tile: runs at most once, when bit 0 of M is set.
 * DSTRM_LT_L1_END, the last label of this listing, is also the target used
 * when bit 0 of N is clear; B is not advanced after this section.
 * INIT_1x1, KERNEL_1x1 and SOLVE_LT_1x1 are macros defined outside this
 * listing.
 */
DSTRM_LT_L1x1_BEGIN:

        andi.   T1, M, 1
        ble     DSTRM_LT_L1x1_END       /* masked value is zero: no 1-wide tile */

        mr      BO, B                   /* BO restarts at B */

DSTRM_LT_L1x1_LOOP_START:

        INIT_1x1

        addic.  L, KK, 0                /* L = KK, CR0 reflects KK */
        ble     DSTRM_LT_L1x1_SAVE      /* KK <= 0: no KERNEL step */

DSTRM_LT_L1x1_LOOP:

        KERNEL_1x1

        addic.  L, L, -1                /* count down the KK steps */
        bgt     DSTRM_LT_L1x1_LOOP

DSTRM_LT_L1x1_SAVE:

        SOLVE_LT_1x1

        addi    CO, CO, 1*SIZE          /* CO moves on by 1 * SIZE */

        sub     T3, K, KK               /* T3 = T4 = K - KK */
        sub     T4, K, KK
        slwi    T3, T3, 0+BASE_SHIFT    /* both scaled by << BASE_SHIFT */
        slwi    T4, T4, 0+BASE_SHIFT
        add     AO, AO, T3
        add     BO, BO, T4
        addi    KK, KK, 1               /* KK grows by the tile width */

DSTRM_LT_L1x1_END:

DSTRM_LT_L1_END:
