optimized dgemm for POWER8

This commit is contained in:
Werner Saar 2016-04-29 12:52:47 +02:00
parent 6f43310de5
commit 56948dbf0f
7 changed files with 1524 additions and 249 deletions

View File

@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMONCOPY = dgemm_ncopy_4_power8.S
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o

View File

@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T4 r12
#define T3 r11
#define o40 r12
#define o56 r11
#define o112 r14
#define o8 r15
#define o24 r16
#define ALPHA r17
#define o64 r17
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define o80 r20
#define o96 r21
#define I r22
#define J r23
#define AO r24
@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
#endif
stfd f1, ALPHA_SP
@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble .L999_H1
#ifdef __64BIT__
addi ALPHA, SP, 296
addi T1, SP, 296
#else
addi ALPHA, SP, 224
addi T1, SP, 224
#endif
li PRE, 384
@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
li o24, 24
li o32, 32
li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112
lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_r, 0, T1
#include "dgemm_logic_16x4_power8.S"
@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
#endif
addi SP, SP, STACKSIZE

View File

@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN:
srawi. I, M, 4
ble LDGEMM_L4x16_END
.align 5
.align 4
LDGEMM_L4x16_BEGIN:
li T4, -128
li L, -128
and T1, CO, T4
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
and T1, T1, L
and T2, T2, L
and T3, T3, L
and T4, T4, L
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
andi. cr0, CO, 127
ble LDGEMM_L4x16_BEGIN_NOPRE
mr BO, B
srawi. L, K, 1
addi T1, T1, 128
addi T2, T2, 128
@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN:
dcbt T3, r0
dcbt T4, r0
LDGEMM_L4x16_BEGIN_NOPRE:
mr BO, B
srawi. L, K, 2
ble LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4
.align 5
.align 4
LDGEMM_L4x16_LOOP_START:
li o40, 40
li o56, 56
dcbt AO, PRE
LOAD4x16_1
dcbt AO, PRE
KERNEL4x16_I1
dcbt AO, PRE
KERNEL4x16_2
dcbt AO, PRE
KERNEL4x16_1
dcbt AO, PRE
KERNEL4x16_2
addic. L, L, -2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END
.align 7
.align 4
LDGEMM_L4x16_LOOP:
dcbt AO, PRE
KERNEL4x16_1
dcbt AO, PRE
KERNEL4x16_2
dcbt AO, PRE
KERNEL4x16_1
dcbt AO, PRE
KERNEL4x16_2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
addic. L, L, -1
KERNEL4x16_L2
bgt LDGEMM_L4x16_LOOP
.align 5
.align 4
LDGEMM_L4x16_LOOP_END:
dcbt AO, PRE
KERNEL4x16_1
dcbt AO, PRE
KERNEL4x16_2
KERNEL4x16_1
KERNEL4x16_E2
@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4:
KERNEL4x16_SUBI1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
b LDGEMM_L4x16_SUB1
LDGEMM_L4x16_SUB0:
andi. L, K, 3
andi. L, K, 1
KERNEL4x16_SUBI1
@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0:
LDGEMM_L4x16_SUB1:
andi. L, K, 3
andi. L, K, 1
ble LDGEMM_L4x16_SAVE
LDGEMM_L4x16_SUB2:
@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2:
addic. L, L, -1
bgt LDGEMM_L4x16_SUB2
.align 5
.align 4
LDGEMM_L4x16_SAVE:
SAVE4x16
@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN:
LDGEMM_L4x8_LOOP_START:
dcbt AO, PRE
LOAD4x8_1
KERNEL4x8_I1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -2
@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START:
LDGEMM_L4x8_LOOP:
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
KERNEL4x8_1
dcbt AO, PRE
KERNEL4x8_2
addic. L, L, -1
@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN:
LDGEMM_L4x4_LOOP_START:
dcbt AO, PRE
LOAD4x4_1
KERNEL4x4_I1
KERNEL4x4_2
KERNEL4x4_1
dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -2
@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP:
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
dcbt AO, PRE
KERNEL4x4_2
KERNEL4x4_1
KERNEL4x4_2
KERNEL4x4_1
dcbt AO, PRE
KERNEL4x4_2
addic. L, L, -1
@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN:
LDGEMM_L2x8_LOOP_START:
dcbt AO, PRE
LOAD2x8_1
KERNEL2x8_I1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -2
@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START:
LDGEMM_L2x8_LOOP:
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -1
@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN:
LDGEMM_L1x8_LOOP_START:
dcbt AO, PRE
LOAD1x8_1
KERNEL1x8_I1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -2
@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START:
LDGEMM_L1x8_LOOP:
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -1

View File

@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs24, 0, BO
lxvdsx vs25, o8, BO
addi AO, AO, 64
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32
.endm
.macro KERNEL4x16_I1
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
xvmuldp vs32, vs0, vs24
xvmuldp vs33, vs1, vs24
xvmuldp vs34, vs2, vs24
xvmuldp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
xvmuldp vs36, vs4, vs24
xvmuldp vs37, vs5, vs24
xvmuldp vs38, vs6, vs24
xvmuldp vs39, vs7, vs24
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
xvmuldp vs40, vs0, vs25
xvmuldp vs41, vs1, vs25
xvmuldp vs42, vs2, vs25
xvmuldp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
xvmuldp vs44, vs4, vs25
xvmuldp vs45, vs5, vs25
xvmuldp vs46, vs6, vs25
xvmuldp vs47, vs7, vs25
addi AO, AO, 64
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
xvmuldp vs48, vs0, vs26
xvmuldp vs49, vs1, vs26
xvmuldp vs50, vs2, vs26
xvmuldp vs51, vs3, vs26
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO
xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26
xvmuldp vs52, vs4, vs26
xvmuldp vs53, vs5, vs26
xvmuldp vs54, vs6, vs26
xvmuldp vs55, vs7, vs26
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
xvmuldp vs56, vs0, vs27
xvmuldp vs57, vs1, vs27
xvmuldp vs58, vs2, vs27
xvmuldp vs59, vs3, vs27
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27
xvmuldp vs60, vs4, vs27
xvmuldp vs61, vs5, vs27
xvmuldp vs62, vs6, vs27
xvmuldp vs63, vs7, vs27
addi AO, AO, 64
addi BO, BO, 32
addi AO, AO, 128
.endm
.macro KERNEL4x16_1
xvmaddadp vs32, vs0, vs24
@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
lxvd2x vs8, 0, AO
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
addi AO, AO, 64
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
lxvd2x vs12, 0, AO
lxvd2x vs13, o16, AO
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
lxvd2x vs14, o32, AO
lxvd2x vs15, o48, AO
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32
.endm
@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
addi AO, AO, 64
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs62, vs14, vs31
xvmaddadp vs63, vs15, vs31
addi AO, AO, 64
addi AO, AO, 128
addi BO, BO, 32
.endm
/* KERNEL4x16_L1: first half of the software-pipelined 16x4 DGEMM loop body.
   Accumulates the "even" A tile (vs0-vs7 = 16 doubles) times the four
   broadcast B scalars (vs24-vs27) into the 32 accumulators vs32-vs63,
   while loading the NEXT 16 doubles of A into vs8-vs15 and the next four
   B scalars into vs28-vs31 for the companion macro KERNEL4x16_L2.
   Advances AO by 128 bytes; BO is advanced (by 64) only in the L2 half. */
.macro KERNEL4x16_L1
/* column 0 of B (vs24) times A rows 0..3 */
xvmaddadp vs32, vs0, vs24
xvmaddadp vs33, vs1, vs24
xvmaddadp vs34, vs2, vs24
xvmaddadp vs35, vs3, vs24
/* prefetch-load next A tile, first half (offsets 0..48) */
lxvd2x vs8, o0, AO
lxvd2x vs9, o16, AO
lxvd2x vs10, o32, AO
lxvd2x vs11, o48, AO
xvmaddadp vs36, vs4, vs24
xvmaddadp vs37, vs5, vs24
xvmaddadp vs38, vs6, vs24
xvmaddadp vs39, vs7, vs24
/* broadcast next B scalars 0 and 1 (bytes 0 and 8 of BO) */
lxvdsx vs28, 0, BO
lxvdsx vs29, o8, BO
/* column 1 of B (vs25) */
xvmaddadp vs40, vs0, vs25
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
xvmaddadp vs47, vs7, vs25
/* column 2 of B (vs26) */
xvmaddadp vs48, vs0, vs26
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
/* next A tile, second half (offsets 64..112) */
lxvd2x vs12, o64, AO
lxvd2x vs13, o80, AO
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
xvmaddadp vs55, vs7, vs26
lxvd2x vs14, o96, AO
lxvd2x vs15, o112, AO
/* column 3 of B (vs27) */
xvmaddadp vs56, vs0, vs27
xvmaddadp vs57, vs1, vs27
xvmaddadp vs58, vs2, vs27
xvmaddadp vs59, vs3, vs27
/* broadcast next B scalars 2 and 3 (bytes 16 and 24 of BO) */
lxvdsx vs30, o16, BO
lxvdsx vs31, o24, BO
xvmaddadp vs60, vs4, vs27
xvmaddadp vs61, vs5, vs27
xvmaddadp vs62, vs6, vs27
xvmaddadp vs63, vs7, vs27
/* consumed 16 doubles of A */
addi AO, AO, 128
.endm
/* KERNEL4x16_L2: second half of the software-pipelined 16x4 DGEMM loop body.
   Accumulates the "odd" A tile (vs8-vs15, loaded by KERNEL4x16_L1) times the
   B scalars vs28-vs31 into vs32-vs63, while reloading vs0-vs7 with the next
   A tile and vs24-vs27 with the next four B scalars (BO offsets 32..56).
   Advances AO by 128 and BO by 64 — i.e. BO moves by two k-iterations,
   covering both the L1 and L2 halves of the pair. */
.macro KERNEL4x16_L2
/* B scalar 0 of this half (vs28) times A rows 0..3 */
xvmaddadp vs32, vs8, vs28
xvmaddadp vs33, vs9, vs28
xvmaddadp vs34, vs10, vs28
xvmaddadp vs35, vs11, vs28
/* reload next A tile for the following L1 half (offsets 0..16) */
lxvd2x vs0, 0, AO
lxvd2x vs1, o16, AO
xvmaddadp vs36, vs12, vs28
xvmaddadp vs37, vs13, vs28
xvmaddadp vs38, vs14, vs28
xvmaddadp vs39, vs15, vs28
/* next B scalars 0 and 1 live at BO+32 / BO+40 (second k of the pair) */
lxvdsx vs24, o32, BO
lxvdsx vs25, o40, BO
/* B scalar 1 (vs29) */
xvmaddadp vs40, vs8, vs29
xvmaddadp vs41, vs9, vs29
xvmaddadp vs42, vs10, vs29
xvmaddadp vs43, vs11, vs29
lxvd2x vs2, o32, AO
lxvd2x vs3, o48, AO
xvmaddadp vs44, vs12, vs29
xvmaddadp vs45, vs13, vs29
xvmaddadp vs46, vs14, vs29
xvmaddadp vs47, vs15, vs29
/* B scalar 2 (vs30) */
xvmaddadp vs48, vs8, vs30
xvmaddadp vs49, vs9, vs30
xvmaddadp vs50, vs10, vs30
xvmaddadp vs51, vs11, vs30
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
xvmaddadp vs52, vs12, vs30
xvmaddadp vs53, vs13, vs30
xvmaddadp vs54, vs14, vs30
xvmaddadp vs55, vs15, vs30
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO
/* B scalar 3 (vs31) */
xvmaddadp vs56, vs8, vs31
xvmaddadp vs57, vs9, vs31
xvmaddadp vs58, vs10, vs31
xvmaddadp vs59, vs11, vs31
/* next B scalars 2 and 3 at BO+48 / BO+56 */
lxvdsx vs26, o48, BO
lxvdsx vs27, o56, BO
xvmaddadp vs60, vs12, vs31
/* pointer updates interleaved with the tail FMAs to hide their latency */
addi AO, AO, 128
xvmaddadp vs61, vs13, vs31
xvmaddadp vs62, vs14, vs31
addi BO, BO, 64
xvmaddadp vs63, vs15, vs31
.endm
.macro KERNEL4x16_E2
@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvdsx vs26, o16, BO
lxvdsx vs27, o24, BO
addi AO, AO, 64
addi BO, BO, 32
lxvd2x vs4, 0, AO
lxvd2x vs5, o16, AO
lxvd2x vs6, o32, AO
lxvd2x vs7, o48, AO
lxvd2x vs4, o64, AO
lxvd2x vs5, o80, AO
lxvd2x vs6, o96, AO
lxvd2x vs7, o112, AO
addi AO, AO, 64
xvmaddadp vs32, vs0, vs24
@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs41, vs1, vs25
xvmaddadp vs42, vs2, vs25
xvmaddadp vs43, vs3, vs25
addi BO, BO, 32
xvmaddadp vs44, vs4, vs25
xvmaddadp vs45, vs5, vs25
xvmaddadp vs46, vs6, vs25
@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs49, vs1, vs26
xvmaddadp vs50, vs2, vs26
xvmaddadp vs51, vs3, vs26
addi AO, AO, 128
xvmaddadp vs52, vs4, vs26
xvmaddadp vs53, vs5, vs26
xvmaddadp vs54, vs6, vs26
@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
mr T1, CO
addi T2, T1, 64
add T3, T1, LDC
addi T4, T3, 64
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
lxvd2x vs2, o32, CO
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
lxvd2x vs8, 0, T2
lxvd2x vs9, o16, T2
lxvd2x vs10, o32, T2
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
lxvd2x vs24, 0, T3
lxvd2x vs25, o16, T3
lxvd2x vs26, o32, T3
lxvd2x vs27, o48, T3
lxvd2x vs28, o64, T3
lxvd2x vs29, o80, T3
lxvd2x vs30, o96, T3
lxvd2x vs31, o112, T3
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
#ifndef TRMMKERNEL
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
@ -465,139 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs32, 0, T4
lxvd2x vs33, o16, T4
lxvd2x vs34, o32, T4
lxvd2x vs35, o48, T4
lxvd2x vs36, o64, T4
lxvd2x vs37, o80, T4
lxvd2x vs38, o96, T4
lxvd2x vs39, o112, T4
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
xvmuldp vs11, vs43, alpha_r
xvmuldp vs12, vs44, alpha_r
xvmuldp vs13, vs45, alpha_r
xvmuldp vs14, vs46, alpha_r
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, o64, T1
stxvd2x vs5, o80, T1
stxvd2x vs6, o96, T1
stxvd2x vs7, o112, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r
slwi T4, LDC, 1
add T1, T1, T4
add T3, T3, T4
addi T2, T1, 64
addi T4, T3, 64
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
lxvd2x vs1, o16, T1
lxvd2x vs2, o32, T1
lxvd2x vs3, o48, T1
xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r
lxvd2x vs4, 0, T2
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
#ifndef TRMMKERNEL
xvmaddadp vs0, vs48, alpha_r
xvmaddadp vs1, vs49, alpha_r
xvmaddadp vs2, vs50, alpha_r
xvmaddadp vs3, vs51, alpha_r
xvmaddadp vs4, vs52, alpha_r
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
xvmaddadp vs11, vs59, alpha_r
xvmaddadp vs12, vs60, alpha_r
xvmaddadp vs13, vs61, alpha_r
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
xvmuldp vs11, vs59, alpha_r
xvmuldp vs12, vs60, alpha_r
xvmuldp vs13, vs61, alpha_r
xvmuldp vs14, vs62, alpha_r
xvmuldp vs15, vs63, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
stxvd2x vs32, o0, T4
stxvd2x vs33, o16, T4
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
addi CO, CO, 128
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
stxvd2x vs39, o112, T4
.endm
/*********************************************************************

View File

@ -0,0 +1,228 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* dgemm_ncopy_4_power8.S — POWER8 driver for the DGEMM A-panel ncopy
   (4-column packing) routine.  Saves the full non-volatile register set,
   validates M/N, scales LDA to bytes, materializes the byte-offset
   constants used by the VSX load/store macros, then runs the packing
   loops included from dgemm_ncopy_logic_4_power8.S.  Returns 0 in r3.
   NOTE(review): PROLOGUE/PROFCODE/EPILOGUE come from common.h — layout
   assumptions (SP, BASE_SHIFT) are defined there; confirm against that
   header when modifying the frame. */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* argument registers (OpenBLAS copy-kernel convention: m, n, a, lda, b) */
#define M r3
#define N r4
#define A r5
#define LDA r6
#define B r7
/* per-iteration source row pointers (4 columns of A walked in lockstep) */
#define A0 r8
#define A1 r9
#define A2 r10
#define A3 r11
#define J r12
/* prefetch distances */
#define PREA r14
#define PREB r15
#define BO r16
/* byte-offset constants for indexed lxvd2x/stxvd2x addressing */
#define o64 r17
#define o80 r18
#define o96 r19
#define o112 r20
#define o8 r21
#define T2 r22
#define I r23
#define o16 r24
#define o32 r25
#define o48 r26
/* NOTU1/NOTU2: reserved/unused scratch — saved and restored but not
   referenced by the logic file */
#define NOTU1 r27
#define NOTU2 r30
#define T1 r31
#define o0 0
#include "dgemm_ncopy_macros_4_power8.S"
#define STACKSIZE 384
PROLOGUE
PROFCODE
/* allocate frame and save all non-volatile FPRs (f14-f31) and GPRs
   (r14-r31) per the 64-bit PowerPC ELF ABI */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* nothing to do for empty matrices */
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
/* LDA: elements -> bytes (BASE_SHIFT = 3 for double) */
slwi LDA, LDA, BASE_SHIFT
/* prefetch 384 bytes ahead on both streams */
li PREA, 384
li PREB, 384
/* load the fixed byte offsets used as index registers by the copy macros */
li o8, 8
li o16, 16
li o32, 32
li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112
#include "dgemm_ncopy_logic_4_power8.S"
L999:
/* return value 0 */
li r3, 0
/* restore non-volatile registers and tear down the frame */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -0,0 +1,237 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/* dgemm_ncopy_logic_4_power8.S — packing loop skeleton for the 4-wide
   DGEMM ncopy.  Outer structure: process floor(N/4) groups of 4 columns,
   then an optional 2-column group, then an optional final column.  Within
   each group, M rows are copied in chunks of 16/8/4/2/1 via the COPY_*
   macros from dgemm_ncopy_macros_4_power8.S (which advance A0..A3 and BO). */
mr BO, B
/* I = N / 4: number of full 4-column groups */
srawi. I, N, 2
ble DCOPYN_L2_BEGIN
DCOPYN_L4_BEGIN:
DCOPYN_L4_LOOP:
/* set up 4 consecutive column pointers; advance A past them */
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
DCOPYN_L4x16_BEGIN:
/* J = M / 16: full 16-row chunks */
srawi. J, M, 4
ble DCOPYN_L4x16_END
DCOPYN_L4x16_LOOP:
/* touch ahead on all four source columns */
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
COPY_4x16
addic. J, J, -1
bgt DCOPYN_L4x16_LOOP
DCOPYN_L4x16_END:
/* remainder rows: 8, 4, 2, 1 */
DCOPYN_L4x8_BEGIN:
andi. J, M, 8
ble DCOPYN_L4x8_END
COPY_4x8
DCOPYN_L4x8_END:
DCOPYN_L4x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L4x4_END
COPY_4x4
DCOPYN_L4x4_END:
DCOPYN_L4x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L4x2_END
COPY_4x2
DCOPYN_L4x2_END:
DCOPYN_L4x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L4x1_END
COPY_4x1
DCOPYN_L4x1_END:
DCOPYN_L4_END:
addic. I, I, -1
bgt DCOPYN_L4_LOOP
DCOPYN_L2_BEGIN:
/* NOTE(review): the raw operand "4" is register r4, i.e. N — this tests
   bit 1 of N (presumably intended to read `andi. T1, N, 2`); confirm
   the register define before editing */
andi. T1, 4, 2
ble DCOPYN_L2_END
DCOPYN_L2_LOOP:
/* 2-column group */
mr A0, A
add A1, A0, LDA
add A, A1, LDA
DCOPYN_L2x16_BEGIN:
srawi. J, M, 4
ble DCOPYN_L2x16_END
DCOPYN_L2x16_LOOP:
COPY_2x16
addic. J, J, -1
bgt DCOPYN_L2x16_LOOP
DCOPYN_L2x16_END:
DCOPYN_L2x8_BEGIN:
andi. J, M, 8
ble DCOPYN_L2x8_END
COPY_2x8
DCOPYN_L2x8_END:
DCOPYN_L2x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L2x4_END
COPY_2x4
DCOPYN_L2x4_END:
DCOPYN_L2x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L2x2_END
COPY_2x2
DCOPYN_L2x2_END:
DCOPYN_L2x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L2x1_END
COPY_2x1
DCOPYN_L2x1_END:
DCOPYN_L2_END:
DCOPYN_L1_BEGIN:
/* NOTE(review): as above, "4" is r4 (N) — tests bit 0 of N */
andi. T1, 4, 1
ble DCOPYN_L1_END
DCOPYN_L1_LOOP:
/* final single column */
mr A0, A
add A, A0, LDA
DCOPYN_L1x16_BEGIN:
srawi. J, M, 4
ble DCOPYN_L1x16_END
DCOPYN_L1x16_LOOP:
COPY_1x16
addic. J, J, -1
bgt DCOPYN_L1x16_LOOP
DCOPYN_L1x16_END:
DCOPYN_L1x8_BEGIN:
andi. J, M, 8
ble DCOPYN_L1x8_END
COPY_1x8
DCOPYN_L1x8_END:
DCOPYN_L1x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L1x4_END
COPY_1x4
DCOPYN_L1x4_END:
DCOPYN_L1x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L1x2_END
COPY_1x2
DCOPYN_L1x2_END:
DCOPYN_L1x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L1x1_END
COPY_1x1
DCOPYN_L1x1_END:
DCOPYN_L1_END:

View File

@ -0,0 +1,691 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/28 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
* Macros for N=4 and M=16
**********************************************************************************************/
.macro COPY_4x16
// Copy/transpose a 16-row x 4-column tile of doubles.
// Loads 128 bytes from each column pointer A0..A3, merges doubleword pairs
// with xxpermdi so each output vector holds two elements of one ROW, and
// stores 512 bytes of row-interleaved data at BO.
// Advances A0..A3 by 128 and BO by 512. Clobbers vs0-vs63.
lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1
lxvd2x vs24, o0, A3 // NOTE(review): A3 loaded before A2 — loads are independent, presumably scheduled for dispatch
lxvd2x vs16, o0, A2
lxvd2x vs1, o16, A0
lxvd2x vs9, o16, A1
lxvd2x vs17, o16, A2
lxvd2x vs25, o16, A3
lxvd2x vs2, o32, A0
lxvd2x vs10, o32, A1
lxvd2x vs18, o32, A2
lxvd2x vs26, o32, A3
lxvd2x vs3, o48, A0
lxvd2x vs11, o48, A1
lxvd2x vs19, o48, A2
lxvd2x vs27, o48, A3
lxvd2x vs4, o64, A0
lxvd2x vs12, o64, A1
lxvd2x vs20, o64, A2
lxvd2x vs28, o64, A3
lxvd2x vs5, o80, A0
lxvd2x vs13, o80, A1
lxvd2x vs21, o80, A2
lxvd2x vs29, o80, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
// xxpermdi mask 0 = {high dw of src1, high dw of src2};
// mask 3 = {low dw of src1, low dw of src2} -> 2x2 doubleword transpose.
xxpermdi vs32, vs0, vs8, 0 // { A0[r], A1[r] }
xxpermdi vs33, vs16, vs24, 0 // { A2[r], A3[r] }
xxpermdi vs34, vs0, vs8, 3 // { A0[r+1], A1[r+1] }
xxpermdi vs35, vs16, vs24, 3 // { A2[r+1], A3[r+1] }
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0
xxpermdi vs53, vs21, vs29, 0
xxpermdi vs54, vs5, vs13, 3
xxpermdi vs55, vs21, vs29, 3
// pointer updates interleaved with permutes (scheduling; no data dependence)
addi A0, A0, 128
addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0
xxpermdi vs57, vs22, vs30, 0
xxpermdi vs58, vs6, vs14, 3
xxpermdi vs59, vs22, vs30, 3
addi A3, A3, 128
addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
// store 4 x 128 bytes of row-major 4-wide groups
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
stxvd2x vs48, o0, BO
stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO
stxvd2x vs53, o80, BO
stxvd2x vs54, o96, BO
stxvd2x vs55, o112, BO
addi BO, BO, 128
stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO
stxvd2x vs59, o48, BO
stxvd2x vs60, o64, BO
stxvd2x vs61, o80, BO
stxvd2x vs62, o96, BO
stxvd2x vs63, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
.macro COPY_4x8
// Copy/transpose an 8-row x 4-column tile of doubles.
// 64 bytes are loaded from each of A0..A3, doubleword pairs are merged into
// row order, and 256 bytes are stored at BO.
// Advances A0..A3 by 64 and BO by 256. Clobbers vs0-vs3, vs8-vs11,
// vs16-vs19, vs24-vs27, vs32-vs47.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs18, o32, A2
lxvd2x vs19, o48, A2
addi A2, A2, 64
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs26, o32, A3
lxvd2x vs27, o48, A3
addi A3, A3, 64
// xxpermdi mask 0 merges high doublewords, mask 3 merges low doublewords
xxpermdi vs32, vs0, vs8, 0 // { A0[r], A1[r] }
xxpermdi vs33, vs16, vs24, 0 // { A2[r], A3[r] }
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
.macro COPY_4x4
// Copy/transpose a 4-row x 4-column tile of doubles.
// 32 bytes per column pointer, merged into row order, 128 bytes stored.
// Advances A0..A3 by 32 and BO by 128.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
addi A2, A2, 32
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
addi A3, A3, 32
// merge high (mask 0) / low (mask 3) doublewords into row-major order
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
.macro COPY_4x2
// Copy/transpose a 2-row x 4-column tile of doubles.
// 16 bytes per column pointer, merged into row order, 64 bytes stored.
// Advances A0..A3 by 16 and BO by 64.
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
lxvd2x vs16, o0, A2
addi A2, A2, 16
lxvd2x vs24, o0, A3
addi A3, A3, 16
// merge high (mask 0) / low (mask 3) doublewords into row-major order
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
.macro COPY_4x1
// Copy one row of a 4-column group: load one scalar double from each of
// A0..A3, pack the four values into two vectors, store 32 bytes at BO.
// Advances A0..A3 by 8 and BO by 32.
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
lxsdx vs16, o0, A2
addi A2, A2, 8
lxsdx vs24, o0, A3
addi A3, A3, 8
// lxsdx puts the scalar in the high doubleword; mask 0 packs two per vector
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=16
**********************************************************************************************/
.macro COPY_2x16
// Copy/transpose a 16-row x 2-column tile of doubles.
// 128 bytes from each of A0, A1 are merged into row-pairs and 256 bytes
// are stored at BO. Advances A0, A1 by 128 and BO by 256.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1
lxvd2x vs15, o112, A1
addi A1, A1, 128
// xxpermdi mask 0 = { A0[r], A1[r] }, mask 3 = { A0[r+1], A1[r+1] }
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
xxpermdi vs40, vs4, vs12, 0
xxpermdi vs41, vs4, vs12, 3
xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0
xxpermdi vs45, vs6, vs14, 3
xxpermdi vs46, vs7, vs15, 0
xxpermdi vs47, vs7, vs15, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro COPY_2x8
// Copy/transpose an 8-row x 2-column tile of doubles.
// 64 bytes from each of A0, A1 merged into row-pairs; 128 bytes stored.
// Advances A0, A1 by 64 and BO by 128.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
// mask 0 = { A0[r], A1[r] }, mask 3 = { A0[r+1], A1[r+1] }
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro COPY_2x4
// Copy/transpose a 4-row x 2-column tile of doubles.
// Advances A0, A1 by 32 and BO by 64.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
// mask 0 = { A0[r], A1[r] }, mask 3 = { A0[r+1], A1[r+1] }
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro COPY_2x2
// Copy/transpose a 2-row x 2-column tile of doubles.
// Advances A0, A1 by 16 and BO by 32.
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
// mask 0 = { A0[r], A1[r] }, mask 3 = { A0[r+1], A1[r+1] }
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro COPY_2x1
// Copy one row of a 2-column group: two scalar doubles packed into one
// vector and stored. Advances A0, A1 by 8 and BO by 16.
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
// lxsdx loads into the high doubleword; mask 0 packs { A0[r], A1[r] }
xxpermdi vs32, vs0, vs8, 0
stxvd2x vs32, o0, BO
addi BO, BO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=16
**********************************************************************************************/
.macro COPY_1x16
// Straight copy of 16 rows of a single column: 128 bytes moved from A0 to
// BO with no permutation needed. Advances A0 and BO by 128.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
stxvd2x vs4, o0, BO
stxvd2x vs5, o16, BO
stxvd2x vs6, o32, BO
stxvd2x vs7, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro COPY_1x8
// Straight copy of 8 rows of a single column (64 bytes).
// Advances A0 and BO by 64.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro COPY_1x4
// Straight copy of 4 rows of a single column (32 bytes).
// Advances A0 and BO by 32.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
addi BO, BO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro COPY_1x2
// Straight copy of 2 rows of a single column (16 bytes).
// Advances A0 and BO by 16.
lxvd2x vs0, o0, A0
addi A0, A0, 16
stxvd2x vs0, o0, BO
addi BO, BO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro COPY_1x1
// Copy a single scalar double. Advances A0 and BO by 8.
lxsdx vs0, o0, A0
addi A0, A0, 8
stxsdx vs0, o0, BO
addi BO, BO, 8
.endm