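/*
 * OpenBLAS/kernel/power/dtrsm_logic_LT_16x4_power8.S
 *
 * Listing metadata: 756 lines, 8.6 KiB. The listing labels the language
 * "ArmAsm", but the instructions below are PowerPC mnemonics.
 */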

/* Four-wide section: J = N >> 2 passes, each built from the *x4 macros. */
        srawi.  J, N, 2
        ble     DSTRM_LT_L4_END         /* J <= 0: nothing to do here */

DSTRM_LT_L4_BEGIN:
        /* Per-pass setup: restart KK and AO, keep the current C in CO. */
        mr      KK, OFFSET
        mr      AO, A
        mr      CO, C
        /* C += LDC * 4 for the next pass (slwi is a 32-bit shift). */
        slwi    T1, LDC, 2
        add     C, C, T1

        /* I = M >> 4 is the number of 16-wide tiles; none -> remainder tiles. */
        srawi.  I, M, 4
        ble     DSTRM_LT_L4x16_END
/*
 * 16x4 tile. Executed I times (I is counted down just before
 * DSTRM_LT_L4x16_END). Each pass restarts BO from B, issues cache-touch
 * hints around CO, runs KERNEL_16x4 KK times, then SOLVE_LT_16x4.
 */
DSTRM_LT_L4x16_BEGIN:
mr BO, B
/* L = -128 is used as a mask below; it is reloaded as the loop count later. */
li L, -128
/* T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC (taken before masking). */
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
/* Round each of the four addresses down to a multiple of 128. */
and T1, T1, L
and T2, T2, L
and T3, T3, L
and T4, T4, L
/*
 * dcbt is a cache-touch hint. Its address is the sum of both register
 * operands, so these touch T1..T4 only if r0 holds zero.
 * NOTE(review): confirm r0 is zero on entry to this code.
 */
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
/* Same hints again, 128 bytes further on. */
addi T1, T1, 128
addi T2, T2, 128
addi T3, T3, 128
addi T4, T4, 128
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
DSTRM_LT_L4x16_LOOP_START:
INIT_16x4
/* L = KK with CR0 updated; KK <= 0 skips straight to the solve step. */
addic. L, KK, 0
ble- DSTRM_LT_L4x16_SAVE
/* CTR = KK drives the loop below. */
mtctr L
/*
 * Unrolled four deep: every KERNEL_16x4 is followed by a CTR-decrementing
 * branch (bdz exits, bdnz loops), so the macro runs exactly KK times.
 * The -/+ suffixes are static branch-prediction hints.
 */
DSTRM_LT_L4x16_LOOP:
dcbt AO, PRE
dcbt BO, PRE
KERNEL_16x4
bdz- DSTRM_LT_L4x16_SAVE
dcbt AO, PRE
KERNEL_16x4
bdz- DSTRM_LT_L4x16_SAVE
dcbt AO, PRE
KERNEL_16x4
bdz- DSTRM_LT_L4x16_SAVE
dcbt AO, PRE
KERNEL_16x4
bdnz+ DSTRM_LT_L4x16_LOOP
DSTRM_LT_L4x16_SAVE:
SOLVE_LT_16x4
addi CO, CO, 16*SIZE
/*
 * AO += (K - KK) << (4+BASE_SHIFT) and BO += (K - KK) << (2+BASE_SHIFT).
 * BO is reloaded from B at the top of the next pass.
 */
sub T3, K, KK
sub T4, K, KK
slwi T3, T3, 4+BASE_SHIFT
slwi T4, T4, 2+BASE_SHIFT
add AO, AO, T3
add BO, BO, T4
addi KK, KK, 16
/* Next 16-wide tile while I > 0. */
addic. I, I, -1
bgt DSTRM_LT_L4x16_BEGIN
DSTRM_LT_L4x16_END:
DSTRM_LT_L4x8_BEGIN:
        /* (M & 15) == 0: no remainder at all, skip every smaller tile. */
        andi.   T2, M, 15
        ble     DSTRM_LT_L4x1_END
        /* This tile runs only when bit 3 of M is set. */
        andi.   T1, M, 8
        ble     DSTRM_LT_L4x8_END

        mr      BO, B

DSTRM_LT_L4x8_LOOP_START:
        INIT_8x4
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L4x8_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L4x8_LOOP:
        KERNEL_8x4
        addic.  L, L, -1
        bgt     DSTRM_LT_L4x8_LOOP      /* KERNEL_8x4 runs KK times */

DSTRM_LT_L4x8_SAVE:
        SOLVE_LT_8x4

        /* AO += (K - KK) << (3+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 3+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (2+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 2+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 8*SIZE
        addi    KK, KK, 8

DSTRM_LT_L4x8_END:
DSTRM_LT_L4x4_BEGIN:
        /* This tile runs only when bit 2 of M is set. */
        andi.   T1, M, 4
        ble     DSTRM_LT_L4x4_END

        mr      BO, B

DSTRM_LT_L4x4_LOOP_START:
        INIT_4x4
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L4x4_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L4x4_LOOP:
        KERNEL_4x4
        addic.  L, L, -1
        bgt     DSTRM_LT_L4x4_LOOP      /* KERNEL_4x4 runs KK times */

DSTRM_LT_L4x4_SAVE:
        SOLVE_LT_4x4

        /* AO += (K - KK) << (2+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 2+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (2+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 2+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 4*SIZE
        addi    KK, KK, 4

DSTRM_LT_L4x4_END:
DSTRM_LT_L4x2_BEGIN:
        /* This tile runs only when bit 1 of M is set. */
        andi.   T1, M, 2
        ble     DSTRM_LT_L4x2_END

        mr      BO, B

DSTRM_LT_L4x2_LOOP_START:
        INIT_2x4
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L4x2_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L4x2_LOOP:
        KERNEL_2x4
        addic.  L, L, -1
        bgt     DSTRM_LT_L4x2_LOOP      /* KERNEL_2x4 runs KK times */

DSTRM_LT_L4x2_SAVE:
        SOLVE_LT_2x4

        /* AO += (K - KK) << (1+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 1+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (2+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 2+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 2*SIZE
        addi    KK, KK, 2

DSTRM_LT_L4x2_END:
DSTRM_LT_L4x1_BEGIN:
        /* This tile runs only when bit 0 of M is set. */
        andi.   T1, M, 1
        ble     DSTRM_LT_L4x1_END

        mr      BO, B

DSTRM_LT_L4x1_LOOP_START:
        INIT_1x4
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L4x1_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L4x1_LOOP:
        KERNEL_1x4
        addic.  L, L, -1
        bgt     DSTRM_LT_L4x1_LOOP      /* KERNEL_1x4 runs KK times */

DSTRM_LT_L4x1_SAVE:
        SOLVE_LT_1x4

        /* AO += (K - KK) << (0+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 0+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (2+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 2+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 1*SIZE
        addi    KK, KK, 1

DSTRM_LT_L4x1_END:
/* End of one four-wide pass: B += K << (2+BASE_SHIFT), then count J down. */
        slwi    T1, K, 2+BASE_SHIFT
        add     B, B, T1
        addic.  J, J, -1
        bgt     DSTRM_LT_L4_BEGIN

        /* All J passes done: leave through L999 when (N & 3) == 0. */
        andi.   T2, N, 3
        ble     L999

DSTRM_LT_L4_END:
        b       DSTRM_LT_L2_BEGIN       /* hop over the trampoline below */

L999_H1:
        /* Trampoline to L999; no branch to it is visible in this part of the file. */
        b       L999
DSTRM_LT_L2_BEGIN:
        /* Two-wide section: taken once, and only when bit 1 of N is set. */
        andi.   T1, N, 2
        ble     DSTRM_LT_L2_END

        /* Setup: restart KK and AO, keep the current C in CO. */
        mr      KK, OFFSET
        mr      AO, A
        mr      CO, C
        /* C += LDC * 2 (slwi is a 32-bit shift). */
        slwi    T1, LDC, 1
        add     C, C, T1

        /* I = M >> 4 is the number of 16-wide tiles; none -> remainder tiles. */
        srawi.  I, M, 4
        ble     DSTRM_LT_L2x16_END
DSTRM_LT_L2x16_BEGIN:
        /* 16x2 tile; repeated while I > 0. BO restarts from B on every pass. */
        mr      BO, B

DSTRM_LT_L2x16_LOOP_START:
        INIT_16x2
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L2x16_SAVE     /* KK <= 0: no KERNEL passes */

DSTRM_LT_L2x16_LOOP:
        KERNEL_16x2
        addic.  L, L, -1
        bgt     DSTRM_LT_L2x16_LOOP     /* KERNEL_16x2 runs KK times */

DSTRM_LT_L2x16_SAVE:
        SOLVE_LT_16x2

        /* AO += (K - KK) << (4+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 4+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (1+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 1+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 16*SIZE
        addi    KK, KK, 16

        addic.  I, I, -1
        bgt     DSTRM_LT_L2x16_BEGIN

DSTRM_LT_L2x16_END:
DSTRM_LT_L2x8_BEGIN:
        /* (M & 15) == 0: no remainder at all, skip every smaller tile. */
        andi.   T2, M, 15
        ble     DSTRM_LT_L2x1_END
        /* This tile runs only when bit 3 of M is set. */
        andi.   T1, M, 8
        ble     DSTRM_LT_L2x8_END

        mr      BO, B

DSTRM_LT_L2x8_LOOP_START:
        INIT_8x2
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L2x8_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L2x8_LOOP:
        KERNEL_8x2
        addic.  L, L, -1
        bgt     DSTRM_LT_L2x8_LOOP      /* KERNEL_8x2 runs KK times */

DSTRM_LT_L2x8_SAVE:
        SOLVE_LT_8x2

        /* AO += (K - KK) << (3+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 3+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (1+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 1+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 8*SIZE
        addi    KK, KK, 8

DSTRM_LT_L2x8_END:
DSTRM_LT_L2x4_BEGIN:
        /* This tile runs only when bit 2 of M is set. */
        andi.   T1, M, 4
        ble     DSTRM_LT_L2x4_END

        mr      BO, B

DSTRM_LT_L2x4_LOOP_START:
        INIT_4x2
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L2x4_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L2x4_LOOP:
        KERNEL_4x2
        addic.  L, L, -1
        bgt     DSTRM_LT_L2x4_LOOP      /* KERNEL_4x2 runs KK times */

DSTRM_LT_L2x4_SAVE:
        SOLVE_LT_4x2

        /* AO += (K - KK) << (2+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 2+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (1+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 1+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 4*SIZE
        addi    KK, KK, 4

DSTRM_LT_L2x4_END:
DSTRM_LT_L2x2_BEGIN:
        /* This tile runs only when bit 1 of M is set. */
        andi.   T1, M, 2
        ble     DSTRM_LT_L2x2_END

        mr      BO, B

DSTRM_LT_L2x2_LOOP_START:
        INIT_2x2
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L2x2_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L2x2_LOOP:
        KERNEL_2x2
        addic.  L, L, -1
        bgt     DSTRM_LT_L2x2_LOOP      /* KERNEL_2x2 runs KK times */

DSTRM_LT_L2x2_SAVE:
        SOLVE_LT_2x2

        /* AO += (K - KK) << (1+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 1+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (1+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 1+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 2*SIZE
        addi    KK, KK, 2

DSTRM_LT_L2x2_END:
DSTRM_LT_L2x1_BEGIN:
        /* This tile runs only when bit 0 of M is set. */
        andi.   T1, M, 1
        ble     DSTRM_LT_L2x1_END

        mr      BO, B

DSTRM_LT_L2x1_LOOP_START:
        INIT_1x2
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L2x1_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L2x1_LOOP:
        KERNEL_1x2
        addic.  L, L, -1
        bgt     DSTRM_LT_L2x1_LOOP      /* KERNEL_1x2 runs KK times */

DSTRM_LT_L2x1_SAVE:
        SOLVE_LT_1x2

        /* AO += (K - KK) << (0+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 0+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (1+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 1+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 1*SIZE
        addi    KK, KK, 1

DSTRM_LT_L2x1_END:
/* End of the two-wide section: B += K << (1+BASE_SHIFT). */
        slwi    T1, K, 1+BASE_SHIFT
        add     B, B, T1

DSTRM_LT_L2_END:
DSTRM_LT_L1_BEGIN:
        /* One-wide section: taken once, and only when bit 0 of N is set. */
        andi.   T1, N, 1
        ble     DSTRM_LT_L1_END

        /* Setup: restart KK and AO, take CO from C (C is not advanced here). */
        mr      KK, OFFSET
        mr      AO, A
        mr      CO, C

        /* I = M >> 4 is the number of 16-wide tiles; none -> remainder tiles. */
        srawi.  I, M, 4
        ble     DSTRM_LT_L1x16_END
DSTRM_LT_L1x16_BEGIN:
        /* 16x1 tile; repeated while I > 0. BO restarts from B on every pass. */
        mr      BO, B

DSTRM_LT_L1x16_LOOP_START:
        INIT_16x1
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L1x16_SAVE     /* KK <= 0: no KERNEL passes */

DSTRM_LT_L1x16_LOOP:
        KERNEL_16x1
        addic.  L, L, -1
        bgt     DSTRM_LT_L1x16_LOOP     /* KERNEL_16x1 runs KK times */

DSTRM_LT_L1x16_SAVE:
        SOLVE_LT_16x1

        /* AO += (K - KK) << (4+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 4+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (0+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 0+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 16*SIZE
        addi    KK, KK, 16

        addic.  I, I, -1
        bgt     DSTRM_LT_L1x16_BEGIN

DSTRM_LT_L1x16_END:
DSTRM_LT_L1x8_BEGIN:
        /*
         * This tile runs only when bit 3 of M is set.
         * NOTE(review): the four- and two-wide sections first test (M & 15)
         * and skip all remainder tiles at once; this section tests each bit
         * separately instead.
         */
        andi.   T1, M, 8
        ble     DSTRM_LT_L1x8_END

        mr      BO, B

DSTRM_LT_L1x8_LOOP_START:
        INIT_8x1
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L1x8_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L1x8_LOOP:
        KERNEL_8x1
        addic.  L, L, -1
        bgt     DSTRM_LT_L1x8_LOOP      /* KERNEL_8x1 runs KK times */

DSTRM_LT_L1x8_SAVE:
        SOLVE_LT_8x1

        /* AO += (K - KK) << (3+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 3+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (0+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 0+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 8*SIZE
        addi    KK, KK, 8

DSTRM_LT_L1x8_END:
DSTRM_LT_L1x4_BEGIN:
        /* This tile runs only when bit 2 of M is set. */
        andi.   T1, M, 4
        ble     DSTRM_LT_L1x4_END

        mr      BO, B

DSTRM_LT_L1x4_LOOP_START:
        INIT_4x1
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L1x4_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L1x4_LOOP:
        KERNEL_4x1
        addic.  L, L, -1
        bgt     DSTRM_LT_L1x4_LOOP      /* KERNEL_4x1 runs KK times */

DSTRM_LT_L1x4_SAVE:
        SOLVE_LT_4x1

        /* AO += (K - KK) << (2+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 2+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (0+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 0+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 4*SIZE
        addi    KK, KK, 4

DSTRM_LT_L1x4_END:
DSTRM_LT_L1x2_BEGIN:
        /* This tile runs only when bit 1 of M is set. */
        andi.   T1, M, 2
        ble     DSTRM_LT_L1x2_END

        mr      BO, B

DSTRM_LT_L1x2_LOOP_START:
        INIT_2x1
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L1x2_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L1x2_LOOP:
        KERNEL_2x1
        addic.  L, L, -1
        bgt     DSTRM_LT_L1x2_LOOP      /* KERNEL_2x1 runs KK times */

DSTRM_LT_L1x2_SAVE:
        SOLVE_LT_2x1

        /* AO += (K - KK) << (1+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 1+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (0+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 0+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 2*SIZE
        addi    KK, KK, 2

DSTRM_LT_L1x2_END:
DSTRM_LT_L1x1_BEGIN:
        /* This tile runs only when bit 0 of M is set. */
        andi.   T1, M, 1
        ble     DSTRM_LT_L1x1_END

        mr      BO, B

DSTRM_LT_L1x1_LOOP_START:
        INIT_1x1
        addic.  L, KK, 0                /* L = KK, CR0 updated */
        ble     DSTRM_LT_L1x1_SAVE      /* KK <= 0: no KERNEL passes */

DSTRM_LT_L1x1_LOOP:
        KERNEL_1x1
        addic.  L, L, -1
        bgt     DSTRM_LT_L1x1_LOOP      /* KERNEL_1x1 runs KK times */

DSTRM_LT_L1x1_SAVE:
        SOLVE_LT_1x1

        /* AO += (K - KK) << (0+BASE_SHIFT) */
        sub     T3, K, KK
        slwi    T3, T3, 0+BASE_SHIFT
        add     AO, AO, T3
        /* BO += (K - KK) << (0+BASE_SHIFT) */
        sub     T4, K, KK
        slwi    T4, T4, 0+BASE_SHIFT
        add     BO, BO, T4

        addi    CO, CO, 1*SIZE
        addi    KK, KK, 1

DSTRM_LT_L1x1_END:
DSTRM_LT_L1_END: