optimized dgemm for POWER8
This commit is contained in:
parent
6f43310de5
commit
56948dbf0f
|
@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
|||
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
|
|
|
@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define T4 r12
|
||||
#define T3 r11
|
||||
|
||||
#define o40 r12
|
||||
#define o56 r11
|
||||
|
||||
#define o112 r14
|
||||
#define o8 r15
|
||||
#define o24 r16
|
||||
#define ALPHA r17
|
||||
#define o64 r17
|
||||
#define L r18
|
||||
#define T1 r19
|
||||
#define KK r20
|
||||
#define BB r21
|
||||
#define o80 r20
|
||||
#define o96 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
|
@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
#else
|
||||
stw r31, 144(SP)
|
||||
stw r30, 148(SP)
|
||||
|
@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stw r17, 200(SP)
|
||||
stw r16, 204(SP)
|
||||
stw r15, 208(SP)
|
||||
stw r14, 212(SP)
|
||||
#endif
|
||||
|
||||
stfd f1, ALPHA_SP
|
||||
|
@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ble .L999_H1
|
||||
|
||||
#ifdef __64BIT__
|
||||
addi ALPHA, SP, 296
|
||||
addi T1, SP, 296
|
||||
#else
|
||||
addi ALPHA, SP, 224
|
||||
addi T1, SP, 224
|
||||
#endif
|
||||
|
||||
li PRE, 384
|
||||
|
@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
li o24, 24
|
||||
li o32, 32
|
||||
li o48, 48
|
||||
li o64, 64
|
||||
li o80, 80
|
||||
li o96, 96
|
||||
li o112, 112
|
||||
|
||||
lxvdsx alpha_r, 0, ALPHA
|
||||
lxvdsx alpha_r, 0, T1
|
||||
|
||||
#include "dgemm_logic_16x4_power8.S"
|
||||
|
||||
|
@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
#else
|
||||
lwz r31, 144(SP)
|
||||
lwz r30, 148(SP)
|
||||
|
@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lwz r17, 200(SP)
|
||||
lwz r16, 204(SP)
|
||||
lwz r15, 208(SP)
|
||||
lwz r14, 212(SP)
|
||||
#endif
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
|
|
|
@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN:
|
|||
srawi. I, M, 4
|
||||
ble LDGEMM_L4x16_END
|
||||
|
||||
.align 5
|
||||
.align 4
|
||||
LDGEMM_L4x16_BEGIN:
|
||||
|
||||
li T4, -128
|
||||
li L, -128
|
||||
|
||||
and T1, CO, T4
|
||||
mr T1, CO
|
||||
add T2, T1, LDC
|
||||
add T3, T2, LDC
|
||||
add T4, T3, LDC
|
||||
|
||||
and T1, T1, L
|
||||
and T2, T2, L
|
||||
and T3, T3, L
|
||||
and T4, T4, L
|
||||
|
||||
dcbt T1, r0
|
||||
dcbt T2, r0
|
||||
dcbt T3, r0
|
||||
dcbt T4, r0
|
||||
|
||||
andi. cr0, CO, 127
|
||||
ble LDGEMM_L4x16_BEGIN_NOPRE
|
||||
mr BO, B
|
||||
srawi. L, K, 1
|
||||
|
||||
addi T1, T1, 128
|
||||
addi T2, T2, 128
|
||||
|
@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN:
|
|||
dcbt T3, r0
|
||||
dcbt T4, r0
|
||||
|
||||
|
||||
LDGEMM_L4x16_BEGIN_NOPRE:
|
||||
|
||||
mr BO, B
|
||||
srawi. L, K, 2
|
||||
ble LDGEMM_L4x16_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble LDGEMM_L4x16_SUB4
|
||||
|
||||
.align 5
|
||||
.align 4
|
||||
LDGEMM_L4x16_LOOP_START:
|
||||
|
||||
li o40, 40
|
||||
li o56, 56
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_I1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_2
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_2
|
||||
|
||||
addic. L, L, -2
|
||||
KERNEL4x16_L2
|
||||
|
||||
ble LDGEMM_L4x16_LOOP_END
|
||||
|
||||
.align 7
|
||||
.align 4
|
||||
|
||||
LDGEMM_L4x16_LOOP:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_2
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_2
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_L1
|
||||
dcbt AO, PRE
|
||||
addic. L, L, -1
|
||||
KERNEL4x16_L2
|
||||
|
||||
bgt LDGEMM_L4x16_LOOP
|
||||
|
||||
.align 5
|
||||
.align 4
|
||||
|
||||
LDGEMM_L4x16_LOOP_END:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_2
|
||||
KERNEL4x16_1
|
||||
KERNEL4x16_E2
|
||||
|
||||
|
@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4:
|
|||
|
||||
KERNEL4x16_SUBI1
|
||||
KERNEL4x16_SUB1
|
||||
KERNEL4x16_SUB1
|
||||
KERNEL4x16_SUB1
|
||||
|
||||
b LDGEMM_L4x16_SUB1
|
||||
|
||||
LDGEMM_L4x16_SUB0:
|
||||
|
||||
andi. L, K, 3
|
||||
andi. L, K, 1
|
||||
|
||||
KERNEL4x16_SUBI1
|
||||
|
||||
|
@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0:
|
|||
|
||||
LDGEMM_L4x16_SUB1:
|
||||
|
||||
andi. L, K, 3
|
||||
andi. L, K, 1
|
||||
ble LDGEMM_L4x16_SAVE
|
||||
|
||||
LDGEMM_L4x16_SUB2:
|
||||
|
@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2:
|
|||
addic. L, L, -1
|
||||
bgt LDGEMM_L4x16_SUB2
|
||||
|
||||
.align 5
|
||||
.align 4
|
||||
LDGEMM_L4x16_SAVE:
|
||||
|
||||
SAVE4x16
|
||||
|
@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN:
|
|||
|
||||
LDGEMM_L4x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD4x8_1
|
||||
KERNEL4x8_I1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
|
@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START:
|
|||
LDGEMM_L4x8_LOOP:
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
|
@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN:
|
|||
|
||||
LDGEMM_L4x4_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD4x4_1
|
||||
KERNEL4x4_I1
|
||||
KERNEL4x4_2
|
||||
KERNEL4x4_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x4_2
|
||||
|
||||
KERNEL4x4_1
|
||||
KERNEL4x4_2
|
||||
KERNEL4x4_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x4_2
|
||||
|
||||
addic. L, L, -2
|
||||
|
@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP:
|
|||
KERNEL4x4_1
|
||||
KERNEL4x4_2
|
||||
KERNEL4x4_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x4_2
|
||||
|
||||
KERNEL4x4_1
|
||||
KERNEL4x4_2
|
||||
KERNEL4x4_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x4_2
|
||||
|
||||
addic. L, L, -1
|
||||
|
@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN:
|
|||
|
||||
LDGEMM_L2x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD2x8_1
|
||||
KERNEL2x8_I1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
|
@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START:
|
|||
LDGEMM_L2x8_LOOP:
|
||||
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
KERNEL2x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL2x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
|
@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN:
|
|||
|
||||
LDGEMM_L1x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
LOAD1x8_1
|
||||
KERNEL1x8_I1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
|
@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START:
|
|||
LDGEMM_L1x8_LOOP:
|
||||
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
KERNEL1x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL1x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
|
|
|
@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvdsx vs24, 0, BO
|
||||
lxvdsx vs25, o8, BO
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
lxvd2x vs4, 0, AO
|
||||
lxvd2x vs5, o16, AO
|
||||
lxvd2x vs6, o32, AO
|
||||
lxvd2x vs7, o48, AO
|
||||
lxvd2x vs4, o64, AO
|
||||
lxvd2x vs5, o80, AO
|
||||
lxvd2x vs6, o96, AO
|
||||
lxvd2x vs7, o112, AO
|
||||
|
||||
lxvdsx vs26, o16, BO
|
||||
lxvdsx vs27, o24, BO
|
||||
|
||||
addi AO, AO, 64
|
||||
addi AO, AO, 128
|
||||
addi BO, BO, 32
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x16_I1
|
||||
|
||||
xvmuldp vs32, vs0, vs24
|
||||
xvmuldp vs33, vs1, vs24
|
||||
xvmuldp vs34, vs2, vs24
|
||||
xvmuldp vs35, vs3, vs24
|
||||
xvmuldp vs32, vs0, vs24
|
||||
xvmuldp vs33, vs1, vs24
|
||||
xvmuldp vs34, vs2, vs24
|
||||
xvmuldp vs35, vs3, vs24
|
||||
|
||||
lxvd2x vs8, 0, AO
|
||||
lxvd2x vs8, o0, AO
|
||||
lxvd2x vs9, o16, AO
|
||||
lxvd2x vs10, o32, AO
|
||||
lxvd2x vs11, o48, AO
|
||||
|
||||
xvmuldp vs36, vs4, vs24
|
||||
xvmuldp vs37, vs5, vs24
|
||||
xvmuldp vs38, vs6, vs24
|
||||
xvmuldp vs39, vs7, vs24
|
||||
xvmuldp vs36, vs4, vs24
|
||||
xvmuldp vs37, vs5, vs24
|
||||
xvmuldp vs38, vs6, vs24
|
||||
xvmuldp vs39, vs7, vs24
|
||||
|
||||
lxvdsx vs28, 0, BO
|
||||
lxvdsx vs29, o8, BO
|
||||
|
||||
xvmuldp vs40, vs0, vs25
|
||||
xvmuldp vs41, vs1, vs25
|
||||
xvmuldp vs42, vs2, vs25
|
||||
xvmuldp vs43, vs3, vs25
|
||||
xvmuldp vs40, vs0, vs25
|
||||
xvmuldp vs41, vs1, vs25
|
||||
xvmuldp vs42, vs2, vs25
|
||||
xvmuldp vs43, vs3, vs25
|
||||
|
||||
lxvd2x vs10, o32, AO
|
||||
lxvd2x vs11, o48, AO
|
||||
|
||||
xvmuldp vs44, vs4, vs25
|
||||
xvmuldp vs45, vs5, vs25
|
||||
xvmuldp vs46, vs6, vs25
|
||||
xvmuldp vs47, vs7, vs25
|
||||
xvmuldp vs44, vs4, vs25
|
||||
xvmuldp vs45, vs5, vs25
|
||||
xvmuldp vs46, vs6, vs25
|
||||
xvmuldp vs47, vs7, vs25
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
xvmuldp vs48, vs0, vs26
|
||||
xvmuldp vs49, vs1, vs26
|
||||
xvmuldp vs50, vs2, vs26
|
||||
xvmuldp vs51, vs3, vs26
|
||||
xvmuldp vs48, vs0, vs26
|
||||
xvmuldp vs49, vs1, vs26
|
||||
xvmuldp vs50, vs2, vs26
|
||||
xvmuldp vs51, vs3, vs26
|
||||
|
||||
lxvd2x vs12, 0, AO
|
||||
lxvd2x vs13, o16, AO
|
||||
lxvd2x vs12, o64, AO
|
||||
lxvd2x vs13, o80, AO
|
||||
|
||||
xvmuldp vs52, vs4, vs26
|
||||
xvmuldp vs53, vs5, vs26
|
||||
xvmuldp vs54, vs6, vs26
|
||||
xvmuldp vs55, vs7, vs26
|
||||
xvmuldp vs52, vs4, vs26
|
||||
xvmuldp vs53, vs5, vs26
|
||||
xvmuldp vs54, vs6, vs26
|
||||
xvmuldp vs55, vs7, vs26
|
||||
|
||||
lxvd2x vs14, o32, AO
|
||||
lxvd2x vs15, o48, AO
|
||||
lxvd2x vs14, o96, AO
|
||||
lxvd2x vs15, o112, AO
|
||||
|
||||
xvmuldp vs56, vs0, vs27
|
||||
xvmuldp vs57, vs1, vs27
|
||||
xvmuldp vs58, vs2, vs27
|
||||
xvmuldp vs59, vs3, vs27
|
||||
|
||||
xvmuldp vs56, vs0, vs27
|
||||
xvmuldp vs57, vs1, vs27
|
||||
xvmuldp vs58, vs2, vs27
|
||||
xvmuldp vs59, vs3, vs27
|
||||
|
||||
lxvdsx vs30, o16, BO
|
||||
lxvdsx vs31, o24, BO
|
||||
|
||||
xvmuldp vs60, vs4, vs27
|
||||
xvmuldp vs61, vs5, vs27
|
||||
xvmuldp vs62, vs6, vs27
|
||||
xvmuldp vs63, vs7, vs27
|
||||
xvmuldp vs60, vs4, vs27
|
||||
xvmuldp vs61, vs5, vs27
|
||||
xvmuldp vs62, vs6, vs27
|
||||
xvmuldp vs63, vs7, vs27
|
||||
|
||||
addi AO, AO, 64
|
||||
addi BO, BO, 32
|
||||
addi AO, AO, 128
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL4x16_1
|
||||
|
||||
xvmaddadp vs32, vs0, vs24
|
||||
|
@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs34, vs2, vs24
|
||||
xvmaddadp vs35, vs3, vs24
|
||||
|
||||
lxvd2x vs8, 0, AO
|
||||
lxvd2x vs8, o0, AO
|
||||
lxvd2x vs9, o16, AO
|
||||
lxvd2x vs10, o32, AO
|
||||
lxvd2x vs11, o48, AO
|
||||
|
||||
xvmaddadp vs36, vs4, vs24
|
||||
xvmaddadp vs37, vs5, vs24
|
||||
|
@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs42, vs2, vs25
|
||||
xvmaddadp vs43, vs3, vs25
|
||||
|
||||
lxvd2x vs10, o32, AO
|
||||
lxvd2x vs11, o48, AO
|
||||
|
||||
xvmaddadp vs44, vs4, vs25
|
||||
xvmaddadp vs45, vs5, vs25
|
||||
xvmaddadp vs46, vs6, vs25
|
||||
xvmaddadp vs47, vs7, vs25
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
xvmaddadp vs48, vs0, vs26
|
||||
xvmaddadp vs49, vs1, vs26
|
||||
xvmaddadp vs50, vs2, vs26
|
||||
xvmaddadp vs51, vs3, vs26
|
||||
|
||||
lxvd2x vs12, 0, AO
|
||||
lxvd2x vs13, o16, AO
|
||||
lxvd2x vs12, o64, AO
|
||||
lxvd2x vs13, o80, AO
|
||||
|
||||
xvmaddadp vs52, vs4, vs26
|
||||
xvmaddadp vs53, vs5, vs26
|
||||
xvmaddadp vs54, vs6, vs26
|
||||
xvmaddadp vs55, vs7, vs26
|
||||
|
||||
lxvd2x vs14, o32, AO
|
||||
lxvd2x vs15, o48, AO
|
||||
lxvd2x vs14, o96, AO
|
||||
lxvd2x vs15, o112, AO
|
||||
|
||||
xvmaddadp vs56, vs0, vs27
|
||||
xvmaddadp vs57, vs1, vs27
|
||||
|
@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs62, vs6, vs27
|
||||
xvmaddadp vs63, vs7, vs27
|
||||
|
||||
addi AO, AO, 64
|
||||
addi AO, AO, 128
|
||||
addi BO, BO, 32
|
||||
|
||||
.endm
|
||||
|
@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs46, vs14, vs29
|
||||
xvmaddadp vs47, vs15, vs29
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
xvmaddadp vs48, vs8, vs30
|
||||
xvmaddadp vs49, vs9, vs30
|
||||
xvmaddadp vs50, vs10, vs30
|
||||
xvmaddadp vs51, vs11, vs30
|
||||
|
||||
lxvd2x vs4, 0, AO
|
||||
lxvd2x vs5, o16, AO
|
||||
lxvd2x vs4, o64, AO
|
||||
lxvd2x vs5, o80, AO
|
||||
|
||||
xvmaddadp vs52, vs12, vs30
|
||||
xvmaddadp vs53, vs13, vs30
|
||||
xvmaddadp vs54, vs14, vs30
|
||||
xvmaddadp vs55, vs15, vs30
|
||||
|
||||
lxvd2x vs6, o32, AO
|
||||
lxvd2x vs7, o48, AO
|
||||
lxvd2x vs6, o96, AO
|
||||
lxvd2x vs7, o112, AO
|
||||
|
||||
xvmaddadp vs56, vs8, vs31
|
||||
xvmaddadp vs57, vs9, vs31
|
||||
|
@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs62, vs14, vs31
|
||||
xvmaddadp vs63, vs15, vs31
|
||||
|
||||
addi AO, AO, 64
|
||||
addi AO, AO, 128
|
||||
addi BO, BO, 32
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL4x16_L1 -- first half of a two-step 4x16 multiply-accumulate.
 *
 * Accumulates vs32..vs63 += (vs0..vs7) * (vs24..vs27) with xvmaddadp, while
 * loading the operand set consumed by KERNEL4x16_L2: vs8..vs15 from AO and
 * vs28..vs31 (one doubleword splatted per register, lxvdsx) from BO.
 * AO is advanced by 128 bytes at the end; BO is not modified by this macro.
 * The o16..o112 operands are index registers; presumably they hold the byte
 * offsets their names suggest (set up by the including kernel -- confirm).
 */
.macro KERNEL4x16_L1
|
||||
|
||||
xvmaddadp vs32, vs0, vs24 /* vs32..vs39 += vs0..vs7 * vs24 */
|
||||
xvmaddadp vs33, vs1, vs24
|
||||
xvmaddadp vs34, vs2, vs24
|
||||
xvmaddadp vs35, vs3, vs24
|
||||
|
||||
lxvd2x vs8, o0, AO /* loads for the next step: vs8..vs11 from AO */
|
||||
lxvd2x vs9, o16, AO
|
||||
lxvd2x vs10, o32, AO
|
||||
lxvd2x vs11, o48, AO
|
||||
|
||||
xvmaddadp vs36, vs4, vs24
|
||||
xvmaddadp vs37, vs5, vs24
|
||||
xvmaddadp vs38, vs6, vs24
|
||||
xvmaddadp vs39, vs7, vs24
|
||||
|
||||
lxvdsx vs28, 0, BO /* splat next-step B values into vs28, vs29 */
|
||||
lxvdsx vs29, o8, BO
|
||||
|
||||
xvmaddadp vs40, vs0, vs25 /* vs40..vs47 += vs0..vs7 * vs25 */
|
||||
xvmaddadp vs41, vs1, vs25
|
||||
xvmaddadp vs42, vs2, vs25
|
||||
xvmaddadp vs43, vs3, vs25
|
||||
|
||||
|
||||
xvmaddadp vs44, vs4, vs25
|
||||
xvmaddadp vs45, vs5, vs25
|
||||
xvmaddadp vs46, vs6, vs25
|
||||
xvmaddadp vs47, vs7, vs25
|
||||
|
||||
|
||||
xvmaddadp vs48, vs0, vs26 /* vs48..vs55 += vs0..vs7 * vs26 */
|
||||
xvmaddadp vs49, vs1, vs26
|
||||
xvmaddadp vs50, vs2, vs26
|
||||
xvmaddadp vs51, vs3, vs26
|
||||
|
||||
lxvd2x vs12, o64, AO /* loads for the next step: vs12..vs15 from AO */
|
||||
lxvd2x vs13, o80, AO
|
||||
|
||||
xvmaddadp vs52, vs4, vs26
|
||||
xvmaddadp vs53, vs5, vs26
|
||||
xvmaddadp vs54, vs6, vs26
|
||||
xvmaddadp vs55, vs7, vs26
|
||||
|
||||
lxvd2x vs14, o96, AO
|
||||
lxvd2x vs15, o112, AO
|
||||
|
||||
xvmaddadp vs56, vs0, vs27 /* vs56..vs63 += vs0..vs7 * vs27 */
|
||||
xvmaddadp vs57, vs1, vs27
|
||||
xvmaddadp vs58, vs2, vs27
|
||||
xvmaddadp vs59, vs3, vs27
|
||||
|
||||
|
||||
lxvdsx vs30, o16, BO /* splat next-step B values into vs30, vs31 */
|
||||
lxvdsx vs31, o24, BO
|
||||
|
||||
xvmaddadp vs60, vs4, vs27
|
||||
xvmaddadp vs61, vs5, vs27
|
||||
xvmaddadp vs62, vs6, vs27
|
||||
xvmaddadp vs63, vs7, vs27
|
||||
|
||||
addi AO, AO, 128 /* only AO moves here; BO stays put */
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL4x16_L2 -- second half of the two-step 4x16 multiply-accumulate.
 *
 * Accumulates vs32..vs63 += (vs8..vs15) * (vs28..vs31) with xvmaddadp, while
 * reloading the operand set used by KERNEL4x16_L1: vs0..vs7 from AO and
 * vs24..vs27 (lxvdsx splat) from BO at index registers o32/o40/o48/o56.
 * AO is advanced by 128 bytes and BO by 64 bytes; both addi instructions are
 * interleaved with the final multiply-adds, after the last load that uses
 * the register they modify.
 * NOTE(review): o40/o56 appear to share registers with T4/T3 in the kernel's
 * #define list, so they must hold 40/56 when this macro runs -- confirm.
 */
.macro KERNEL4x16_L2
|
||||
|
||||
xvmaddadp vs32, vs8, vs28 /* vs32..vs39 += vs8..vs15 * vs28 */
|
||||
xvmaddadp vs33, vs9, vs28
|
||||
xvmaddadp vs34, vs10, vs28
|
||||
xvmaddadp vs35, vs11, vs28
|
||||
|
||||
lxvd2x vs0, 0, AO /* reload vs0, vs1 from AO */
|
||||
lxvd2x vs1, o16, AO
|
||||
|
||||
xvmaddadp vs36, vs12, vs28
|
||||
xvmaddadp vs37, vs13, vs28
|
||||
xvmaddadp vs38, vs14, vs28
|
||||
xvmaddadp vs39, vs15, vs28
|
||||
|
||||
lxvdsx vs24, o32, BO /* splat B values at BO+o32, BO+o40 */
|
||||
lxvdsx vs25, o40, BO
|
||||
|
||||
xvmaddadp vs40, vs8, vs29 /* vs40..vs47 += vs8..vs15 * vs29 */
|
||||
xvmaddadp vs41, vs9, vs29
|
||||
xvmaddadp vs42, vs10, vs29
|
||||
xvmaddadp vs43, vs11, vs29
|
||||
|
||||
lxvd2x vs2, o32, AO
|
||||
lxvd2x vs3, o48, AO
|
||||
|
||||
xvmaddadp vs44, vs12, vs29
|
||||
xvmaddadp vs45, vs13, vs29
|
||||
xvmaddadp vs46, vs14, vs29
|
||||
xvmaddadp vs47, vs15, vs29
|
||||
|
||||
|
||||
xvmaddadp vs48, vs8, vs30 /* vs48..vs55 += vs8..vs15 * vs30 */
|
||||
xvmaddadp vs49, vs9, vs30
|
||||
xvmaddadp vs50, vs10, vs30
|
||||
xvmaddadp vs51, vs11, vs30
|
||||
|
||||
lxvd2x vs4, o64, AO
|
||||
lxvd2x vs5, o80, AO
|
||||
|
||||
xvmaddadp vs52, vs12, vs30
|
||||
xvmaddadp vs53, vs13, vs30
|
||||
xvmaddadp vs54, vs14, vs30
|
||||
xvmaddadp vs55, vs15, vs30
|
||||
|
||||
lxvd2x vs6, o96, AO
|
||||
lxvd2x vs7, o112, AO
|
||||
|
||||
xvmaddadp vs56, vs8, vs31 /* vs56..vs63 += vs8..vs15 * vs31 */
|
||||
xvmaddadp vs57, vs9, vs31
|
||||
xvmaddadp vs58, vs10, vs31
|
||||
xvmaddadp vs59, vs11, vs31
|
||||
|
||||
lxvdsx vs26, o48, BO /* last BO-relative loads before BO is bumped */
|
||||
lxvdsx vs27, o56, BO
|
||||
|
||||
xvmaddadp vs60, vs12, vs31
|
||||
|
||||
addi AO, AO, 128 /* pointer update placed between the FMAs */
|
||||
xvmaddadp vs61, vs13, vs31
|
||||
|
||||
xvmaddadp vs62, vs14, vs31
|
||||
|
||||
addi BO, BO, 64 /* 64 bytes of B are consumed per L1+L2 pair */
|
||||
xvmaddadp vs63, vs15, vs31
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x16_E2
|
||||
|
||||
|
||||
|
@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvdsx vs26, o16, BO
|
||||
lxvdsx vs27, o24, BO
|
||||
|
||||
addi AO, AO, 64
|
||||
addi BO, BO, 32
|
||||
|
||||
lxvd2x vs4, 0, AO
|
||||
lxvd2x vs5, o16, AO
|
||||
lxvd2x vs6, o32, AO
|
||||
lxvd2x vs7, o48, AO
|
||||
lxvd2x vs4, o64, AO
|
||||
lxvd2x vs5, o80, AO
|
||||
lxvd2x vs6, o96, AO
|
||||
lxvd2x vs7, o112, AO
|
||||
|
||||
addi AO, AO, 64
|
||||
|
||||
|
||||
xvmaddadp vs32, vs0, vs24
|
||||
|
@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs41, vs1, vs25
|
||||
xvmaddadp vs42, vs2, vs25
|
||||
xvmaddadp vs43, vs3, vs25
|
||||
addi BO, BO, 32
|
||||
xvmaddadp vs44, vs4, vs25
|
||||
xvmaddadp vs45, vs5, vs25
|
||||
xvmaddadp vs46, vs6, vs25
|
||||
|
@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs49, vs1, vs26
|
||||
xvmaddadp vs50, vs2, vs26
|
||||
xvmaddadp vs51, vs3, vs26
|
||||
addi AO, AO, 128
|
||||
xvmaddadp vs52, vs4, vs26
|
||||
xvmaddadp vs53, vs5, vs26
|
||||
xvmaddadp vs54, vs6, vs26
|
||||
|
@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro SAVE4x16
|
||||
|
||||
mr T1, CO
|
||||
addi T2, T1, 64
|
||||
add T3, T1, LDC
|
||||
addi T4, T3, 64
|
||||
add T2, T1, LDC
|
||||
add T3, T2, LDC
|
||||
add T4, T3, LDC
|
||||
|
||||
#ifndef TRMMKERNEL
|
||||
lxvd2x vs0, 0, T1
|
||||
lxvd2x vs1, o16, T1
|
||||
lxvd2x vs2, o32, T1
|
||||
lxvd2x vs3, o48, T1
|
||||
lxvd2x vs0, 0, CO
|
||||
lxvd2x vs1, o16, CO
|
||||
lxvd2x vs2, o32, CO
|
||||
lxvd2x vs3, o48, CO
|
||||
lxvd2x vs4, o64, CO
|
||||
lxvd2x vs5, o80, CO
|
||||
lxvd2x vs6, o96, CO
|
||||
lxvd2x vs7, o112, CO
|
||||
|
||||
lxvd2x vs4, 0, T2
|
||||
lxvd2x vs5, o16, T2
|
||||
lxvd2x vs6, o32, T2
|
||||
lxvd2x vs7, o48, T2
|
||||
lxvd2x vs8, 0, T2
|
||||
lxvd2x vs9, o16, T2
|
||||
lxvd2x vs10, o32, T2
|
||||
lxvd2x vs11, o48, T2
|
||||
lxvd2x vs12, o64, T2
|
||||
lxvd2x vs13, o80, T2
|
||||
lxvd2x vs14, o96, T2
|
||||
lxvd2x vs15, o112, T2
|
||||
|
||||
lxvd2x vs8, 0, T3
|
||||
lxvd2x vs9, o16, T3
|
||||
lxvd2x vs10, o32, T3
|
||||
lxvd2x vs11, o48, T3
|
||||
lxvd2x vs24, 0, T3
|
||||
lxvd2x vs25, o16, T3
|
||||
lxvd2x vs26, o32, T3
|
||||
lxvd2x vs27, o48, T3
|
||||
lxvd2x vs28, o64, T3
|
||||
lxvd2x vs29, o80, T3
|
||||
lxvd2x vs30, o96, T3
|
||||
lxvd2x vs31, o112, T3
|
||||
|
||||
lxvd2x vs12, 0, T4
|
||||
lxvd2x vs13, o16, T4
|
||||
lxvd2x vs14, o32, T4
|
||||
lxvd2x vs15, o48, T4
|
||||
#endif
|
||||
|
||||
#ifndef TRMMKERNEL
|
||||
xvmaddadp vs0, vs32, alpha_r
|
||||
xvmaddadp vs1, vs33, alpha_r
|
||||
xvmaddadp vs2, vs34, alpha_r
|
||||
|
@ -465,139 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs5, vs37, alpha_r
|
||||
xvmaddadp vs6, vs38, alpha_r
|
||||
xvmaddadp vs7, vs39, alpha_r
|
||||
|
||||
lxvd2x vs32, 0, T4
|
||||
lxvd2x vs33, o16, T4
|
||||
lxvd2x vs34, o32, T4
|
||||
lxvd2x vs35, o48, T4
|
||||
lxvd2x vs36, o64, T4
|
||||
lxvd2x vs37, o80, T4
|
||||
lxvd2x vs38, o96, T4
|
||||
lxvd2x vs39, o112, T4
|
||||
|
||||
xvmaddadp vs8, vs40, alpha_r
|
||||
xvmaddadp vs9, vs41, alpha_r
|
||||
xvmaddadp vs10, vs42, alpha_r
|
||||
xvmaddadp vs11, vs43, alpha_r
|
||||
|
||||
stxvd2x vs0, 0, T1
|
||||
stxvd2x vs1, o16, T1
|
||||
stxvd2x vs2, o32, T1
|
||||
stxvd2x vs3, o48, T1
|
||||
|
||||
xvmaddadp vs12, vs44, alpha_r
|
||||
xvmaddadp vs13, vs45, alpha_r
|
||||
xvmaddadp vs14, vs46, alpha_r
|
||||
xvmaddadp vs15, vs47, alpha_r
|
||||
#else
|
||||
xvmuldp vs0, vs32, alpha_r
|
||||
xvmuldp vs1, vs33, alpha_r
|
||||
xvmuldp vs2, vs34, alpha_r
|
||||
xvmuldp vs3, vs35, alpha_r
|
||||
xvmuldp vs4, vs36, alpha_r
|
||||
xvmuldp vs5, vs37, alpha_r
|
||||
xvmuldp vs6, vs38, alpha_r
|
||||
xvmuldp vs7, vs39, alpha_r
|
||||
xvmuldp vs8, vs40, alpha_r
|
||||
xvmuldp vs9, vs41, alpha_r
|
||||
xvmuldp vs10, vs42, alpha_r
|
||||
xvmuldp vs11, vs43, alpha_r
|
||||
xvmuldp vs12, vs44, alpha_r
|
||||
xvmuldp vs13, vs45, alpha_r
|
||||
xvmuldp vs14, vs46, alpha_r
|
||||
xvmuldp vs15, vs47, alpha_r
|
||||
#endif
|
||||
|
||||
stxvd2x vs0, 0, T1
|
||||
stxvd2x vs1, o16, T1
|
||||
stxvd2x vs2, o32, T1
|
||||
stxvd2x vs3, o48, T1
|
||||
stxvd2x vs4, o64, T1
|
||||
stxvd2x vs5, o80, T1
|
||||
stxvd2x vs6, o96, T1
|
||||
stxvd2x vs7, o112, T1
|
||||
|
||||
stxvd2x vs4, 0, T2
|
||||
stxvd2x vs5, o16, T2
|
||||
stxvd2x vs6, o32, T2
|
||||
stxvd2x vs7, o48, T2
|
||||
xvmaddadp vs24, vs48, alpha_r
|
||||
xvmaddadp vs25, vs49, alpha_r
|
||||
xvmaddadp vs26, vs50, alpha_r
|
||||
xvmaddadp vs27, vs51, alpha_r
|
||||
|
||||
stxvd2x vs8, 0, T3
|
||||
stxvd2x vs9, o16, T3
|
||||
stxvd2x vs10, o32, T3
|
||||
stxvd2x vs11, o48, T3
|
||||
stxvd2x vs8, o0, T2
|
||||
stxvd2x vs9, o16, T2
|
||||
stxvd2x vs10, o32, T2
|
||||
stxvd2x vs11, o48, T2
|
||||
|
||||
stxvd2x vs12, 0, T4
|
||||
stxvd2x vs13, o16, T4
|
||||
stxvd2x vs14, o32, T4
|
||||
stxvd2x vs15, o48, T4
|
||||
xvmaddadp vs28, vs52, alpha_r
|
||||
xvmaddadp vs29, vs53, alpha_r
|
||||
xvmaddadp vs30, vs54, alpha_r
|
||||
xvmaddadp vs31, vs55, alpha_r
|
||||
|
||||
slwi T4, LDC, 1
|
||||
add T1, T1, T4
|
||||
add T3, T3, T4
|
||||
addi T2, T1, 64
|
||||
addi T4, T3, 64
|
||||
stxvd2x vs12, o64, T2
|
||||
stxvd2x vs13, o80, T2
|
||||
stxvd2x vs14, o96, T2
|
||||
stxvd2x vs15, o112, T2
|
||||
|
||||
#ifndef TRMMKERNEL
|
||||
lxvd2x vs0, 0, T1
|
||||
lxvd2x vs1, o16, T1
|
||||
lxvd2x vs2, o32, T1
|
||||
lxvd2x vs3, o48, T1
|
||||
xvmaddadp vs32, vs56, alpha_r
|
||||
xvmaddadp vs33, vs57, alpha_r
|
||||
xvmaddadp vs34, vs58, alpha_r
|
||||
xvmaddadp vs35, vs59, alpha_r
|
||||
|
||||
lxvd2x vs4, 0, T2
|
||||
lxvd2x vs5, o16, T2
|
||||
lxvd2x vs6, o32, T2
|
||||
lxvd2x vs7, o48, T2
|
||||
stxvd2x vs24, 0, T3
|
||||
stxvd2x vs25, o16, T3
|
||||
stxvd2x vs26, o32, T3
|
||||
stxvd2x vs27, o48, T3
|
||||
|
||||
lxvd2x vs8, 0, T3
|
||||
lxvd2x vs9, o16, T3
|
||||
lxvd2x vs10, o32, T3
|
||||
lxvd2x vs11, o48, T3
|
||||
xvmaddadp vs36, vs60, alpha_r
|
||||
xvmaddadp vs37, vs61, alpha_r
|
||||
xvmaddadp vs38, vs62, alpha_r
|
||||
xvmaddadp vs39, vs63, alpha_r
|
||||
|
||||
lxvd2x vs12, 0, T4
|
||||
lxvd2x vs13, o16, T4
|
||||
lxvd2x vs14, o32, T4
|
||||
lxvd2x vs15, o48, T4
|
||||
#endif
|
||||
stxvd2x vs28, o64, T3
|
||||
stxvd2x vs29, o80, T3
|
||||
stxvd2x vs30, o96, T3
|
||||
stxvd2x vs31, o112, T3
|
||||
|
||||
#ifndef TRMMKERNEL
|
||||
xvmaddadp vs0, vs48, alpha_r
|
||||
xvmaddadp vs1, vs49, alpha_r
|
||||
xvmaddadp vs2, vs50, alpha_r
|
||||
xvmaddadp vs3, vs51, alpha_r
|
||||
xvmaddadp vs4, vs52, alpha_r
|
||||
xvmaddadp vs5, vs53, alpha_r
|
||||
xvmaddadp vs6, vs54, alpha_r
|
||||
xvmaddadp vs7, vs55, alpha_r
|
||||
xvmaddadp vs8, vs56, alpha_r
|
||||
xvmaddadp vs9, vs57, alpha_r
|
||||
xvmaddadp vs10, vs58, alpha_r
|
||||
xvmaddadp vs11, vs59, alpha_r
|
||||
xvmaddadp vs12, vs60, alpha_r
|
||||
xvmaddadp vs13, vs61, alpha_r
|
||||
xvmaddadp vs14, vs62, alpha_r
|
||||
xvmaddadp vs15, vs63, alpha_r
|
||||
#else
|
||||
xvmuldp vs0, vs48, alpha_r
|
||||
xvmuldp vs1, vs49, alpha_r
|
||||
xvmuldp vs2, vs50, alpha_r
|
||||
xvmuldp vs3, vs51, alpha_r
|
||||
xvmuldp vs4, vs52, alpha_r
|
||||
xvmuldp vs5, vs53, alpha_r
|
||||
xvmuldp vs6, vs54, alpha_r
|
||||
xvmuldp vs7, vs55, alpha_r
|
||||
xvmuldp vs8, vs56, alpha_r
|
||||
xvmuldp vs9, vs57, alpha_r
|
||||
xvmuldp vs10, vs58, alpha_r
|
||||
xvmuldp vs11, vs59, alpha_r
|
||||
xvmuldp vs12, vs60, alpha_r
|
||||
xvmuldp vs13, vs61, alpha_r
|
||||
xvmuldp vs14, vs62, alpha_r
|
||||
xvmuldp vs15, vs63, alpha_r
|
||||
#endif
|
||||
|
||||
stxvd2x vs0, 0, T1
|
||||
stxvd2x vs1, o16, T1
|
||||
stxvd2x vs2, o32, T1
|
||||
stxvd2x vs3, o48, T1
|
||||
|
||||
stxvd2x vs4, 0, T2
|
||||
stxvd2x vs5, o16, T2
|
||||
stxvd2x vs6, o32, T2
|
||||
stxvd2x vs7, o48, T2
|
||||
|
||||
stxvd2x vs8, 0, T3
|
||||
stxvd2x vs9, o16, T3
|
||||
stxvd2x vs10, o32, T3
|
||||
stxvd2x vs11, o48, T3
|
||||
|
||||
stxvd2x vs12, 0, T4
|
||||
stxvd2x vs13, o16, T4
|
||||
stxvd2x vs14, o32, T4
|
||||
stxvd2x vs15, o48, T4
|
||||
stxvd2x vs32, o0, T4
|
||||
stxvd2x vs33, o16, T4
|
||||
stxvd2x vs34, o32, T4
|
||||
stxvd2x vs35, o48, T4
|
||||
|
||||
addi CO, CO, 128
|
||||
|
||||
stxvd2x vs36, o64, T4
|
||||
stxvd2x vs37, o80, T4
|
||||
stxvd2x vs38, o96, T4
|
||||
stxvd2x vs39, o112, T4
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
/*********************************************************************
|
||||
|
|
|
@ -0,0 +1,228 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Register and constant names for the 4-wide ncopy routine that follows.
 * The aliases below are consumed by the routine in this file and,
 * presumably, by the two included macro/logic files (not shown here). */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
#define M r3 /* incoming arguments, r3..r7 */
|
||||
#define N r4
|
||||
#define A r5
|
||||
#define LDA r6 /* scaled by BASE_SHIFT in the routine before use */
|
||||
#define B r7
|
||||
|
||||
#define A0 r8 /* presumably per-column source pointers -- confirm in logic file */
|
||||
#define A1 r9
|
||||
#define A2 r10
|
||||
#define A3 r11
|
||||
|
||||
#define J r12
|
||||
|
||||
#define PREA r14 /* loaded with 384 by the routine */
|
||||
#define PREB r15 /* loaded with 384 by the routine */
|
||||
#define BO r16
|
||||
#define o64 r17 /* oNN registers are loaded with the constant NN by the routine */
|
||||
#define o80 r18
|
||||
#define o96 r19
|
||||
#define o112 r20
|
||||
#define o8 r21
|
||||
#define T2 r22
|
||||
#define I r23
|
||||
#define o16 r24
|
||||
#define o32 r25
|
||||
#define o48 r26
|
||||
#define NOTU1 r27 /* name suggests "not used"; no use visible in this file */
|
||||
#define NOTU2 r30 /* name suggests "not used"; no use visible in this file */
|
||||
#define T1 r31
|
||||
|
||||
#define o0 0 /* literal 0 (not a register) */
|
||||
|
||||
#include "dgemm_ncopy_macros_4_power8.S"
|
||||
|
||||
#define STACKSIZE 384 /* frame size; the routine stores at offsets 0..280 */
|
||||
|
||||
|
||||
/*
 * Entry point of the 4-wide dgemm ncopy routine.
 *
 * Allocates a STACKSIZE-byte frame, saves f14-f31 and r14-r31, returns early
 * when M <= 0 or N <= 0, scales LDA by BASE_SHIFT, loads the prefetch and
 * offset constants, then runs the included copy logic. L999 is the common
 * exit: it sets r3 to 0, restores the saved registers and returns.
 * Only 64-bit saves (std/ld) are used; there is no 32-bit path here.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE /* plain addi: no back-chain word is written */
|
||||
li r0, 0
|
||||
|
||||
stfd f14, 0(SP) /* save f14..f31 at 0..136(SP) */
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP) /* save r31..r14 at 144..280(SP) */
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
cmpwi cr0, M, 0 /* NOTE(review): word compare; only the low 32 bits of M/N are tested */
|
||||
ble- L999
|
||||
cmpwi cr0, N, 0
|
||||
ble- L999
|
||||
|
||||
slwi LDA, LDA, BASE_SHIFT /* NOTE(review): slwi is a 32-bit shift -- confirm LDA << BASE_SHIFT always fits */
|
||||
|
||||
li PREA, 384 /* names suggest prefetch distances for A and B -- confirm in logic file */
|
||||
li PREB, 384
|
||||
|
||||
li o8, 8 /* constant index registers for the included logic/macros */
|
||||
li o16, 16
|
||||
li o32, 32
|
||||
li o48, 48
|
||||
li o64, 64
|
||||
li o80, 80
|
||||
li o96, 96
|
||||
li o112, 112
|
||||
|
||||
#include "dgemm_ncopy_logic_4_power8.S"
|
||||
|
||||
L999:
|
||||
|
||||
li r3, 0 /* return value 0 */
|
||||
|
||||
lfd f14, 0(SP) /* restore in the same layout used by the saves above */
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE /* release the frame */
|
||||
|
||||
blr
|
||||
EPILOGUE
|
||||
|
||||
|
|
@ -0,0 +1,237 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
/*
 * Panels of four columns.
 *
 * BO starts at B and is advanced only by the COPY_* macros.  For every
 * group of four columns the rows are consumed in blocks of 16, then the
 * 8/4/2/1 row remainders are handled according to the low bits of M.
 */
	mr	BO, B
	srawi.	I, N, 2			/* I = N / 4 */
	ble	DCOPYN_L2_BEGIN		/* fewer than four columns */


DCOPYN_L4_BEGIN:

DCOPYN_L4_LOOP:

	/* Column cursors for this panel; A moves on to the next panel */
	mr	A0, A
	add	A1, A0, LDA
	add	A2, A1, LDA
	add	A3, A2, LDA
	add	A,  A3, LDA

DCOPYN_L4x16_BEGIN:

	srawi.	J, M, 4			/* J = M / 16 */
	ble	DCOPYN_L4x16_END

DCOPYN_L4x16_LOOP:

	/* Touch the data PREA bytes ahead in each of the four columns */
	dcbt	A0, PREA
	dcbt	A1, PREA
	dcbt	A2, PREA
	dcbt	A3, PREA
	COPY_4x16
	addic.	J, J, -1
	bgt	DCOPYN_L4x16_LOOP

DCOPYN_L4x16_END:


DCOPYN_L4x8_BEGIN:

	/* andi. never yields a negative result, so "zero" is the only skip case */
	andi.	J, M, 8
	beq	DCOPYN_L4x8_END
	COPY_4x8

DCOPYN_L4x8_END:


DCOPYN_L4x4_BEGIN:

	andi.	J, M, 4
	beq	DCOPYN_L4x4_END
	COPY_4x4

DCOPYN_L4x4_END:


DCOPYN_L4x2_BEGIN:

	andi.	J, M, 2
	beq	DCOPYN_L4x2_END
	COPY_4x2

DCOPYN_L4x2_END:


DCOPYN_L4x1_BEGIN:

	andi.	J, M, 1
	beq	DCOPYN_L4x1_END
	COPY_4x1

DCOPYN_L4x1_END:


DCOPYN_L4_END:

	addic.	I, I, -1
	bgt	DCOPYN_L4_LOOP
|
||||
|
||||
/*
 * Panel of two columns, executed at most once: taken when bit 1 of N is
 * set.  Rows are consumed in blocks of 16, then the 8/4/2/1 remainders
 * follow the low bits of M.
 */
DCOPYN_L2_BEGIN:

	/* Test N through its alias rather than a bare register number */
	andi.	T1, N, 2
	beq	DCOPYN_L2_END

DCOPYN_L2_LOOP:

	/* Column cursors for this panel; A moves on past both columns */
	mr	A0, A
	add	A1, A0, LDA
	add	A,  A1, LDA

DCOPYN_L2x16_BEGIN:

	srawi.	J, M, 4			/* J = M / 16 */
	ble	DCOPYN_L2x16_END

DCOPYN_L2x16_LOOP:

	COPY_2x16
	addic.	J, J, -1
	bgt	DCOPYN_L2x16_LOOP

DCOPYN_L2x16_END:


DCOPYN_L2x8_BEGIN:

	andi.	J, M, 8
	beq	DCOPYN_L2x8_END
	COPY_2x8

DCOPYN_L2x8_END:


DCOPYN_L2x4_BEGIN:

	andi.	J, M, 4
	beq	DCOPYN_L2x4_END
	COPY_2x4

DCOPYN_L2x4_END:


DCOPYN_L2x2_BEGIN:

	andi.	J, M, 2
	beq	DCOPYN_L2x2_END
	COPY_2x2

DCOPYN_L2x2_END:


DCOPYN_L2x1_BEGIN:

	andi.	J, M, 1
	beq	DCOPYN_L2x1_END
	COPY_2x1

DCOPYN_L2x1_END:


DCOPYN_L2_END:
|
||||
|
||||
|
||||
/*
 * Final single column, executed at most once: taken when bit 0 of N is
 * set.  With one column there is nothing to interleave, so the COPY_1x*
 * macros are straight copies from A0 to BO.
 */
DCOPYN_L1_BEGIN:

	/* Test N through its alias rather than a bare register number */
	andi.	T1, N, 1
	beq	DCOPYN_L1_END

DCOPYN_L1_LOOP:

	mr	A0, A
	add	A, A0, LDA

DCOPYN_L1x16_BEGIN:

	srawi.	J, M, 4			/* J = M / 16 */
	ble	DCOPYN_L1x16_END

DCOPYN_L1x16_LOOP:

	COPY_1x16
	addic.	J, J, -1
	bgt	DCOPYN_L1x16_LOOP

DCOPYN_L1x16_END:


DCOPYN_L1x8_BEGIN:

	andi.	J, M, 8
	beq	DCOPYN_L1x8_END
	COPY_1x8

DCOPYN_L1x8_END:


DCOPYN_L1x4_BEGIN:

	andi.	J, M, 4
	beq	DCOPYN_L1x4_END
	COPY_1x4

DCOPYN_L1x4_END:


DCOPYN_L1x2_BEGIN:

	andi.	J, M, 2
	beq	DCOPYN_L1x2_END
	COPY_1x2

DCOPYN_L1x2_END:


DCOPYN_L1x1_BEGIN:

	andi.	J, M, 1
	beq	DCOPYN_L1x1_END
	COPY_1x1

DCOPYN_L1x1_END:


DCOPYN_L1_END:
|
||||
|
|
@ -0,0 +1,691 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=16
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_4x16

	/*
	 * Interleave 16 rows of four columns.
	 *   in : A0..A3 column cursors, BO write cursor, o16..o112 offsets
	 *   out: 512 bytes at BO; A0..A3 advanced by 128, BO by 512
	 *   clobbers vs0-vs63
	 *
	 * Column k is loaded into vs(8k)..vs(8k+7).
	 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs16,	o0,	A2
	lxvd2x		vs24,	o0,	A3

	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs9,	o16,	A1
	lxvd2x		vs17,	o16,	A2
	lxvd2x		vs25,	o16,	A3

	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs10,	o32,	A1
	lxvd2x		vs18,	o32,	A2
	lxvd2x		vs26,	o32,	A3

	lxvd2x		vs3,	o48,	A0
	lxvd2x		vs11,	o48,	A1
	lxvd2x		vs19,	o48,	A2
	lxvd2x		vs27,	o48,	A3

	lxvd2x		vs4,	o64,	A0
	lxvd2x		vs12,	o64,	A1
	lxvd2x		vs20,	o64,	A2
	lxvd2x		vs28,	o64,	A3

	lxvd2x		vs5,	o80,	A0
	lxvd2x		vs13,	o80,	A1
	lxvd2x		vs21,	o80,	A2
	lxvd2x		vs29,	o80,	A3

	lxvd2x		vs6,	o96,	A0
	lxvd2x		vs14,	o96,	A1
	lxvd2x		vs22,	o96,	A2
	lxvd2x		vs30,	o96,	A3

	lxvd2x		vs7,	o112,	A0
	lxvd2x		vs15,	o112,	A1
	lxvd2x		vs23,	o112,	A2
	lxvd2x		vs31,	o112,	A3

	/*
	 * Selector 0 pairs the first doubleword of both sources, selector 3
	 * the second.  Each group of four results therefore holds two rows,
	 * every row laid out as (col0, col1) (col2, col3).
	 */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs16,	vs24,	0
	xxpermdi	vs34,	vs0,	vs8,	3
	xxpermdi	vs35,	vs16,	vs24,	3

	xxpermdi	vs36,	vs1,	vs9,	0
	xxpermdi	vs37,	vs17,	vs25,	0
	xxpermdi	vs38,	vs1,	vs9,	3
	xxpermdi	vs39,	vs17,	vs25,	3

	xxpermdi	vs40,	vs2,	vs10,	0
	xxpermdi	vs41,	vs18,	vs26,	0
	xxpermdi	vs42,	vs2,	vs10,	3
	xxpermdi	vs43,	vs18,	vs26,	3

	xxpermdi	vs44,	vs3,	vs11,	0
	xxpermdi	vs45,	vs19,	vs27,	0
	xxpermdi	vs46,	vs3,	vs11,	3
	xxpermdi	vs47,	vs19,	vs27,	3

	xxpermdi	vs48,	vs4,	vs12,	0
	xxpermdi	vs49,	vs20,	vs28,	0
	xxpermdi	vs50,	vs4,	vs12,	3
	xxpermdi	vs51,	vs20,	vs28,	3

	xxpermdi	vs52,	vs5,	vs13,	0
	xxpermdi	vs53,	vs21,	vs29,	0
	xxpermdi	vs54,	vs5,	vs13,	3
	xxpermdi	vs55,	vs21,	vs29,	3

	/* Cursor updates are slotted between the permutes */
	addi		A0,	A0,	128
	addi		A1,	A1,	128

	xxpermdi	vs56,	vs6,	vs14,	0
	xxpermdi	vs57,	vs22,	vs30,	0
	xxpermdi	vs58,	vs6,	vs14,	3
	xxpermdi	vs59,	vs22,	vs30,	3

	addi		A2,	A2,	128
	addi		A3,	A3,	128

	xxpermdi	vs60,	vs7,	vs15,	0
	xxpermdi	vs61,	vs23,	vs31,	0
	xxpermdi	vs62,	vs7,	vs15,	3
	xxpermdi	vs63,	vs23,	vs31,	3

	/* Rows 0-3 */
	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	stxvd2x		vs36,	o64,	BO
	stxvd2x		vs37,	o80,	BO
	stxvd2x		vs38,	o96,	BO
	stxvd2x		vs39,	o112,	BO
	addi		BO,	BO,	128

	/* Rows 4-7 */
	stxvd2x		vs40,	o0,	BO
	stxvd2x		vs41,	o16,	BO
	stxvd2x		vs42,	o32,	BO
	stxvd2x		vs43,	o48,	BO
	stxvd2x		vs44,	o64,	BO
	stxvd2x		vs45,	o80,	BO
	stxvd2x		vs46,	o96,	BO
	stxvd2x		vs47,	o112,	BO
	addi		BO,	BO,	128

	/* Rows 8-11 */
	stxvd2x		vs48,	o0,	BO
	stxvd2x		vs49,	o16,	BO
	stxvd2x		vs50,	o32,	BO
	stxvd2x		vs51,	o48,	BO
	stxvd2x		vs52,	o64,	BO
	stxvd2x		vs53,	o80,	BO
	stxvd2x		vs54,	o96,	BO
	stxvd2x		vs55,	o112,	BO
	addi		BO,	BO,	128

	/* Rows 12-15 */
	stxvd2x		vs56,	o0,	BO
	stxvd2x		vs57,	o16,	BO
	stxvd2x		vs58,	o32,	BO
	stxvd2x		vs59,	o48,	BO
	stxvd2x		vs60,	o64,	BO
	stxvd2x		vs61,	o80,	BO
	stxvd2x		vs62,	o96,	BO
	stxvd2x		vs63,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_4x8

	/*
	 * Interleave 8 rows of four columns.
	 *   out: 256 bytes at BO; A0..A3 advanced by 64, BO by 256
	 *   clobbers vs0-vs3, vs8-vs11, vs16-vs19, vs24-vs27, vs32-vs47
	 */

	/* Column 0 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs3,	o48,	A0
	addi		A0,	A0,	64

	/* Column 1 */
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs9,	o16,	A1
	lxvd2x		vs10,	o32,	A1
	lxvd2x		vs11,	o48,	A1
	addi		A1,	A1,	64

	/* Column 2 */
	lxvd2x		vs16,	o0,	A2
	lxvd2x		vs17,	o16,	A2
	lxvd2x		vs18,	o32,	A2
	lxvd2x		vs19,	o48,	A2
	addi		A2,	A2,	64

	/* Column 3 */
	lxvd2x		vs24,	o0,	A3
	lxvd2x		vs25,	o16,	A3
	lxvd2x		vs26,	o32,	A3
	lxvd2x		vs27,	o48,	A3
	addi		A3,	A3,	64

	/* Selector 0 = first doublewords, selector 3 = second doublewords */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs16,	vs24,	0
	xxpermdi	vs34,	vs0,	vs8,	3
	xxpermdi	vs35,	vs16,	vs24,	3

	xxpermdi	vs36,	vs1,	vs9,	0
	xxpermdi	vs37,	vs17,	vs25,	0
	xxpermdi	vs38,	vs1,	vs9,	3
	xxpermdi	vs39,	vs17,	vs25,	3

	xxpermdi	vs40,	vs2,	vs10,	0
	xxpermdi	vs41,	vs18,	vs26,	0
	xxpermdi	vs42,	vs2,	vs10,	3
	xxpermdi	vs43,	vs18,	vs26,	3

	xxpermdi	vs44,	vs3,	vs11,	0
	xxpermdi	vs45,	vs19,	vs27,	0
	xxpermdi	vs46,	vs3,	vs11,	3
	xxpermdi	vs47,	vs19,	vs27,	3

	/* Rows 0-3 */
	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	stxvd2x		vs36,	o64,	BO
	stxvd2x		vs37,	o80,	BO
	stxvd2x		vs38,	o96,	BO
	stxvd2x		vs39,	o112,	BO
	addi		BO,	BO,	128

	/* Rows 4-7 */
	stxvd2x		vs40,	o0,	BO
	stxvd2x		vs41,	o16,	BO
	stxvd2x		vs42,	o32,	BO
	stxvd2x		vs43,	o48,	BO
	stxvd2x		vs44,	o64,	BO
	stxvd2x		vs45,	o80,	BO
	stxvd2x		vs46,	o96,	BO
	stxvd2x		vs47,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_4x4

	/*
	 * Interleave 4 rows of four columns.
	 *   out: 128 bytes at BO; A0..A3 advanced by 32, BO by 128
	 *   clobbers vs0, vs1, vs8, vs9, vs16, vs17, vs24, vs25, vs32-vs39
	 */

	/* Column 0 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	addi		A0,	A0,	32

	/* Column 1 */
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs9,	o16,	A1
	addi		A1,	A1,	32

	/* Column 2 */
	lxvd2x		vs16,	o0,	A2
	lxvd2x		vs17,	o16,	A2
	addi		A2,	A2,	32

	/* Column 3 */
	lxvd2x		vs24,	o0,	A3
	lxvd2x		vs25,	o16,	A3
	addi		A3,	A3,	32

	/* Rows 0 and 1 */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs16,	vs24,	0
	xxpermdi	vs34,	vs0,	vs8,	3
	xxpermdi	vs35,	vs16,	vs24,	3

	/* Rows 2 and 3 */
	xxpermdi	vs36,	vs1,	vs9,	0
	xxpermdi	vs37,	vs17,	vs25,	0
	xxpermdi	vs38,	vs1,	vs9,	3
	xxpermdi	vs39,	vs17,	vs25,	3

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	stxvd2x		vs36,	o64,	BO
	stxvd2x		vs37,	o80,	BO
	stxvd2x		vs38,	o96,	BO
	stxvd2x		vs39,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_4x2

	/*
	 * Interleave 2 rows of four columns.
	 *   out: 64 bytes at BO; A0..A3 advanced by 16, BO by 64
	 *   clobbers vs0, vs8, vs16, vs24, vs32-vs35
	 */

	/* One vector (two rows) per column */
	lxvd2x		vs0,	o0,	A0
	addi		A0,	A0,	16

	lxvd2x		vs8,	o0,	A1
	addi		A1,	A1,	16

	lxvd2x		vs16,	o0,	A2
	addi		A2,	A2,	16

	lxvd2x		vs24,	o0,	A3
	addi		A3,	A3,	16

	/* Row 0 from the first doublewords, row 1 from the second */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs16,	vs24,	0
	xxpermdi	vs34,	vs0,	vs8,	3
	xxpermdi	vs35,	vs16,	vs24,	3

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	addi		BO,	BO,	64

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_4x1

	/*
	 * Gather a single row of four columns.
	 *   out: 32 bytes at BO; A0..A3 advanced by 8, BO by 32
	 *   clobbers vs0, vs8, vs16, vs24, vs32, vs33
	 */

	/* Scalar loads: one double from each column */
	lxsdx		vs0,	o0,	A0
	addi		A0,	A0,	8

	lxsdx		vs8,	o0,	A1
	addi		A1,	A1,	8

	lxsdx		vs16,	o0,	A2
	addi		A2,	A2,	8

	lxsdx		vs24,	o0,	A3
	addi		A3,	A3,	8

	/* Pack the four scalars into two vectors */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs16,	vs24,	0

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	addi		BO,	BO,	32

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=16
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_2x16

	/*
	 * Interleave 16 rows of two columns.
	 *   out: 256 bytes at BO; A0 and A1 advanced by 128, BO by 256
	 *   clobbers vs0-vs15, vs32-vs47
	 */

	/* Column 0 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs3,	o48,	A0
	lxvd2x		vs4,	o64,	A0
	lxvd2x		vs5,	o80,	A0
	lxvd2x		vs6,	o96,	A0
	lxvd2x		vs7,	o112,	A0
	addi		A0,	A0,	128

	/* Column 1 */
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs9,	o16,	A1
	lxvd2x		vs10,	o32,	A1
	lxvd2x		vs11,	o48,	A1
	lxvd2x		vs12,	o64,	A1
	lxvd2x		vs13,	o80,	A1
	lxvd2x		vs14,	o96,	A1
	lxvd2x		vs15,	o112,	A1
	addi		A1,	A1,	128

	/* Every result vector is one row: (col0, col1) */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs0,	vs8,	3

	xxpermdi	vs34,	vs1,	vs9,	0
	xxpermdi	vs35,	vs1,	vs9,	3

	xxpermdi	vs36,	vs2,	vs10,	0
	xxpermdi	vs37,	vs2,	vs10,	3

	xxpermdi	vs38,	vs3,	vs11,	0
	xxpermdi	vs39,	vs3,	vs11,	3

	xxpermdi	vs40,	vs4,	vs12,	0
	xxpermdi	vs41,	vs4,	vs12,	3

	xxpermdi	vs42,	vs5,	vs13,	0
	xxpermdi	vs43,	vs5,	vs13,	3

	xxpermdi	vs44,	vs6,	vs14,	0
	xxpermdi	vs45,	vs6,	vs14,	3

	xxpermdi	vs46,	vs7,	vs15,	0
	xxpermdi	vs47,	vs7,	vs15,	3

	/* Rows 0-7 */
	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	stxvd2x		vs36,	o64,	BO
	stxvd2x		vs37,	o80,	BO
	stxvd2x		vs38,	o96,	BO
	stxvd2x		vs39,	o112,	BO
	addi		BO,	BO,	128

	/* Rows 8-15 */
	stxvd2x		vs40,	o0,	BO
	stxvd2x		vs41,	o16,	BO
	stxvd2x		vs42,	o32,	BO
	stxvd2x		vs43,	o48,	BO
	stxvd2x		vs44,	o64,	BO
	stxvd2x		vs45,	o80,	BO
	stxvd2x		vs46,	o96,	BO
	stxvd2x		vs47,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_2x8

	/*
	 * Interleave 8 rows of two columns.
	 *   out: 128 bytes at BO; A0 and A1 advanced by 64, BO by 128
	 *   clobbers vs0-vs3, vs8-vs11, vs32-vs39
	 */

	/* Column 0 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs3,	o48,	A0
	addi		A0,	A0,	64

	/* Column 1 */
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs9,	o16,	A1
	lxvd2x		vs10,	o32,	A1
	lxvd2x		vs11,	o48,	A1
	addi		A1,	A1,	64

	/* Every result vector is one row: (col0, col1) */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs0,	vs8,	3

	xxpermdi	vs34,	vs1,	vs9,	0
	xxpermdi	vs35,	vs1,	vs9,	3

	xxpermdi	vs36,	vs2,	vs10,	0
	xxpermdi	vs37,	vs2,	vs10,	3

	xxpermdi	vs38,	vs3,	vs11,	0
	xxpermdi	vs39,	vs3,	vs11,	3

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	stxvd2x		vs36,	o64,	BO
	stxvd2x		vs37,	o80,	BO
	stxvd2x		vs38,	o96,	BO
	stxvd2x		vs39,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_2x4

	/*
	 * Interleave 4 rows of two columns.
	 *   out: 64 bytes at BO; A0 and A1 advanced by 32, BO by 64
	 *   clobbers vs0, vs1, vs8, vs9, vs32-vs35
	 */

	/* Column 0 */
	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	addi		A0,	A0,	32

	/* Column 1 */
	lxvd2x		vs8,	o0,	A1
	lxvd2x		vs9,	o16,	A1
	addi		A1,	A1,	32

	/* Every result vector is one row: (col0, col1) */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs0,	vs8,	3

	xxpermdi	vs34,	vs1,	vs9,	0
	xxpermdi	vs35,	vs1,	vs9,	3

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	stxvd2x		vs34,	o32,	BO
	stxvd2x		vs35,	o48,	BO
	addi		BO,	BO,	64

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_2x2

	/*
	 * Interleave 2 rows of two columns (a 2x2 transpose).
	 *   out: 32 bytes at BO; A0 and A1 advanced by 16, BO by 32
	 *   clobbers vs0, vs8, vs32, vs33
	 */

	lxvd2x		vs0,	o0,	A0
	addi		A0,	A0,	16

	lxvd2x		vs8,	o0,	A1
	addi		A1,	A1,	16

	/* Row 0 from the first doublewords, row 1 from the second */
	xxpermdi	vs32,	vs0,	vs8,	0
	xxpermdi	vs33,	vs0,	vs8,	3

	stxvd2x		vs32,	o0,	BO
	stxvd2x		vs33,	o16,	BO
	addi		BO,	BO,	32

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_2x1

	/*
	 * Gather a single row of two columns.
	 *   out: 16 bytes at BO; A0 and A1 advanced by 8, BO by 16
	 *   clobbers vs0, vs8, vs32
	 */

	/* Scalar loads: one double from each column */
	lxsdx		vs0,	o0,	A0
	addi		A0,	A0,	8

	lxsdx		vs8,	o0,	A1
	addi		A1,	A1,	8

	/* Pack both scalars into one vector */
	xxpermdi	vs32,	vs0,	vs8,	0

	stxvd2x		vs32,	o0,	BO
	addi		BO,	BO,	16

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=16
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_1x16

	/*
	 * Straight copy of 16 rows of a single column.
	 *   out: 128 bytes at BO; A0 and BO advanced by 128
	 *   clobbers vs0-vs7
	 *
	 * The stores use the full o0..o112 offset set with one cursor
	 * update, matching the other macros that write 128 bytes.
	 */

	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs3,	o48,	A0
	lxvd2x		vs4,	o64,	A0
	lxvd2x		vs5,	o80,	A0
	lxvd2x		vs6,	o96,	A0
	lxvd2x		vs7,	o112,	A0
	addi		A0,	A0,	128

	stxvd2x		vs0,	o0,	BO
	stxvd2x		vs1,	o16,	BO
	stxvd2x		vs2,	o32,	BO
	stxvd2x		vs3,	o48,	BO
	stxvd2x		vs4,	o64,	BO
	stxvd2x		vs5,	o80,	BO
	stxvd2x		vs6,	o96,	BO
	stxvd2x		vs7,	o112,	BO
	addi		BO,	BO,	128

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_1x8

	/*
	 * Straight copy of 8 rows of a single column.
	 *   out: 64 bytes at BO; A0 and BO advanced by 64
	 *   clobbers vs0-vs3
	 */

	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	lxvd2x		vs2,	o32,	A0
	lxvd2x		vs3,	o48,	A0
	addi		A0,	A0,	64

	stxvd2x		vs0,	o0,	BO
	stxvd2x		vs1,	o16,	BO
	stxvd2x		vs2,	o32,	BO
	stxvd2x		vs3,	o48,	BO
	addi		BO,	BO,	64

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_1x4

	/*
	 * Straight copy of 4 rows of a single column.
	 *   out: 32 bytes at BO; A0 and BO advanced by 32
	 *   clobbers vs0, vs1
	 */

	lxvd2x		vs0,	o0,	A0
	lxvd2x		vs1,	o16,	A0
	addi		A0,	A0,	32

	stxvd2x		vs0,	o0,	BO
	stxvd2x		vs1,	o16,	BO
	addi		BO,	BO,	32

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_1x2

	/*
	 * Straight copy of 2 rows of a single column.
	 *   out: 16 bytes at BO; A0 and BO advanced by 16
	 *   clobbers vs0
	 */

	lxvd2x		vs0,	o0,	A0
	addi		A0,	A0,	16

	stxvd2x		vs0,	o0,	BO
	addi		BO,	BO,	16

.endm
|
||||
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
.macro COPY_1x1

	/*
	 * Copy one double from the single column.
	 *   out: 8 bytes at BO; A0 and BO advanced by 8
	 *   clobbers vs0
	 */

	/* Scalar forms: exactly one doubleword is read and written */
	lxsdx		vs0,	o0,	A0
	addi		A0,	A0,	8

	stxsdx		vs0,	o0,	BO
	addi		BO,	BO,	8

.endm
|
||||
|
Loading…
Reference in New Issue