diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fb07ccffd..8e3d084aa 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index bcc6ce328..8af7fe389 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T4 r12 #define T3 r11 +#define o40 r12 +#define o56 r11 + +#define o112 r14 #define o8 r15 #define o24 r16 -#define ALPHA r17 +#define o64 r17 #define L r18 #define T1 r19 -#define KK r20 -#define BB r21 +#define o80 r20 +#define o96 r21 #define I r22 #define J r23 #define AO r24 @@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif stfd f1, ALPHA_SP @@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .L999_H1 #ifdef __64BIT__ - addi ALPHA, SP, 296 + addi T1, SP, 296 #else - addi ALPHA, SP, 224 + addi T1, SP, 224 #endif li PRE, 384 @@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o24, 24 li o32, 32 li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 - lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_r, 0, T1 #include "dgemm_logic_16x4_power8.S" @@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 4ad3387e8..718f80bdd 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN: srawi. I, M, 4 ble LDGEMM_L4x16_END - .align 5 + .align 4 LDGEMM_L4x16_BEGIN: - li T4, -128 + li L, -128 - and T1, CO, T4 + mr T1, CO add T2, T1, LDC add T3, T2, LDC add T4, T3, LDC + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 - andi. cr0, CO, 127 - ble LDGEMM_L4x16_BEGIN_NOPRE + mr BO, B + srawi. L, K, 1 addi T1, T1, 128 addi T2, T2, 128 @@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN: dcbt T3, r0 dcbt T4, r0 - -LDGEMM_L4x16_BEGIN_NOPRE: - - mr BO, B - srawi. L, K, 2 ble LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x16_SUB4 - .align 5 + .align 4 LDGEMM_L4x16_LOOP_START: + li o40, 40 + li o56, 56 + dcbt AO, PRE LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - addic. 
L, L, -2 + KERNEL4x16_L2 + ble LDGEMM_L4x16_LOOP_END - .align 7 + .align 4 LDGEMM_L4x16_LOOP: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE addic. L, L, -1 + KERNEL4x16_L2 + bgt LDGEMM_L4x16_LOOP - .align 5 + .align 4 + LDGEMM_L4x16_LOOP_END: - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 KERNEL4x16_1 KERNEL4x16_E2 @@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4: KERNEL4x16_SUBI1 KERNEL4x16_SUB1 - KERNEL4x16_SUB1 - KERNEL4x16_SUB1 b LDGEMM_L4x16_SUB1 LDGEMM_L4x16_SUB0: - andi. L, K, 3 + andi. L, K, 1 KERNEL4x16_SUBI1 @@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0: LDGEMM_L4x16_SUB1: - andi. L, K, 3 + andi. L, K, 1 ble LDGEMM_L4x16_SAVE LDGEMM_L4x16_SUB2: @@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2: addic. L, L, -1 bgt LDGEMM_L4x16_SUB2 - .align 5 + .align 4 LDGEMM_L4x16_SAVE: SAVE4x16 @@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN: LDGEMM_L4x8_LOOP_START: + dcbt AO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 @@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START: LDGEMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 @@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN: LDGEMM_L4x4_LOOP_START: + dcbt AO, PRE LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -2 @@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 + dcbt AO, PRE KERNEL4x4_2 addic. L, L, -1 @@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN: LDGEMM_L2x8_LOOP_START: + dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 @@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START: LDGEMM_L2x8_LOOP: KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 + dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 @@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN: LDGEMM_L1x8_LOOP_START: + dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 @@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START: LDGEMM_L1x8_LOOP: KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 + dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 36531fbe9..2c7851207 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO - addi AO, AO, 64 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm + .macro KERNEL4x16_I1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 - addi AO, AO, 64 - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 - addi AO, AO, 64 - addi BO, BO, 32 + addi AO, AO, 128 .endm + + .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 - lxvd2x vs8, 0, AO + lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 - addi AO, AO, 64 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 - addi AO, AO, 64 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 - addi AO, AO, 64 + addi AO, AO, 128 addi BO, BO, 32 .endm +.macro KERNEL4x16_L1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, o0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, o64, AO + lxvd2x vs13, o80, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + lxvd2x vs14, o96, AO + lxvd2x vs15, o112, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 128 + +.endm + +.macro KERNEL4x16_L2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, o32, BO + lxvdsx vs25, o40, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, 
vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o48, BO + lxvdsx vs27, o56, BO + + xvmaddadp vs60, vs12, vs31 + addi AO, AO, 128 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + addi BO, BO, 64 + xvmaddadp vs63, vs15, vs31 + + +.endm + + .macro KERNEL4x16_E2 @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO - addi AO, AO, 64 - addi BO, BO, 32 - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO + lxvd2x vs4, o64, AO + lxvd2x vs5, o80, AO + lxvd2x vs6, o96, AO + lxvd2x vs7, o112, AO - addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 + addi BO, BO, 32 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 + addi AO, AO, 128 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 @@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 mr T1, CO - addi T2, T1, 64 - add T3, T1, LDC - addi T4, T3, 64 + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 + lxvd2x vs0, 0, CO + lxvd2x vs1, o16, CO + lxvd2x vs2, o32, CO + lxvd2x vs3, o48, CO + lxvd2x vs4, o64, CO + lxvd2x vs5, o80, CO + lxvd2x vs6, o96, CO + lxvd2x vs7, o112, CO - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 + lxvd2x vs8, 0, T2 + lxvd2x vs9, o16, T2 + lxvd2x vs10, o32, T2 + lxvd2x vs11, o48, T2 + lxvd2x vs12, o64, T2 + lxvd2x vs13, o80, T2 + lxvd2x vs14, o96, T2 + lxvd2x vs15, o112, T2 - lxvd2x vs8, 0, T3 - lxvd2x vs9, o16, T3 - lxvd2x vs10, o32, T3 - lxvd2x vs11, o48, T3 + lxvd2x vs24, 0, T3 + lxvd2x vs25, o16, T3 + lxvd2x vs26, o32, T3 + lxvd2x vs27, o48, T3 + lxvd2x vs28, o64, T3 + lxvd2x vs29, o80, T3 + lxvd2x vs30, o96, T3 + lxvd2x vs31, o112, T3 - lxvd2x vs12, 0, T4 - lxvd2x vs13, o16, T4 - lxvd2x vs14, o32, T4 - lxvd2x vs15, o48, T4 -#endif - -#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r @@ -465,139 +599,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r + + lxvd2x vs32, 0, T4 + lxvd2x vs33, o16, T4 + lxvd2x vs34, o32, T4 + lxvd2x vs35, o48, T4 + lxvd2x vs36, o64, T4 + lxvd2x vs37, o80, T4 + lxvd2x vs38, o96, T4 + lxvd2x vs39, o112, T4 + xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 + stxvd2x vs4, o64, T1 + stxvd2x vs5, o80, T1 + stxvd2x vs6, o96, T1 + stxvd2x vs7, o112, T1 - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 + xvmaddadp vs24, vs48, alpha_r + xvmaddadp vs25, vs49, alpha_r + xvmaddadp vs26, vs50, alpha_r + xvmaddadp vs27, vs51, alpha_r - stxvd2x vs8, 0, T3 - stxvd2x vs9, o16, T3 - stxvd2x vs10, o32, T3 - stxvd2x vs11, o48, T3 + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 - stxvd2x vs12, 0, T4 - stxvd2x vs13, o16, T4 - stxvd2x vs14, o32, T4 - stxvd2x vs15, o48, T4 + xvmaddadp vs28, vs52, alpha_r + xvmaddadp vs29, vs53, alpha_r + xvmaddadp vs30, vs54, alpha_r + xvmaddadp vs31, vs55, alpha_r - slwi T4, LDC, 1 - add T1, T1, T4 - add T3, T3, T4 - addi T2, T1, 64 - addi T4, T3, 64 + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 + xvmaddadp vs32, vs56, alpha_r + xvmaddadp vs33, vs57, alpha_r + xvmaddadp vs34, vs58, alpha_r + xvmaddadp vs35, vs59, alpha_r - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 - lxvd2x vs8, 0, T3 - lxvd2x vs9, o16, T3 - lxvd2x vs10, o32, T3 - lxvd2x vs11, o48, T3 + xvmaddadp vs36, vs60, alpha_r + xvmaddadp vs37, vs61, alpha_r + xvmaddadp vs38, vs62, alpha_r + xvmaddadp vs39, vs63, alpha_r - lxvd2x vs12, 0, T4 - lxvd2x vs13, o16, T4 - lxvd2x vs14, o32, T4 - lxvd2x vs15, o48, T4 -#endif + stxvd2x vs28, o64, T3 + stxvd2x vs29, o80, T3 + stxvd2x vs30, o96, T3 + stxvd2x vs31, o112, T3 -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r - xvmaddadp vs2, vs50, alpha_r - xvmaddadp vs3, vs51, alpha_r - xvmaddadp vs4, vs52, alpha_r - xvmaddadp vs5, vs53, alpha_r - xvmaddadp vs6, vs54, alpha_r - xvmaddadp vs7, vs55, alpha_r - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r - xvmaddadp vs10, vs58, alpha_r - xvmaddadp vs11, vs59, alpha_r - xvmaddadp vs12, vs60, alpha_r - xvmaddadp vs13, vs61, alpha_r - xvmaddadp vs14, vs62, alpha_r - xvmaddadp vs15, vs63, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r - xvmuldp vs2, vs50, alpha_r - xvmuldp vs3, vs51, alpha_r 
- xvmuldp vs4, vs52, alpha_r - xvmuldp vs5, vs53, alpha_r - xvmuldp vs6, vs54, alpha_r - xvmuldp vs7, vs55, alpha_r - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r - xvmuldp vs10, vs58, alpha_r - xvmuldp vs11, vs59, alpha_r - xvmuldp vs12, vs60, alpha_r - xvmuldp vs13, vs61, alpha_r - xvmuldp vs14, vs62, alpha_r - xvmuldp vs15, vs63, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - stxvd2x vs8, 0, T3 - stxvd2x vs9, o16, T3 - stxvd2x vs10, o32, T3 - stxvd2x vs11, o48, T3 - - stxvd2x vs12, 0, T4 - stxvd2x vs13, o16, T4 - stxvd2x vs14, o32, T4 - stxvd2x vs15, o48, T4 + stxvd2x vs32, o0, T4 + stxvd2x vs33, o16, T4 + stxvd2x vs34, o32, T4 + stxvd2x vs35, o48, T4 addi CO, CO, 128 + stxvd2x vs36, o64, T4 + stxvd2x vs37, o80, T4 + stxvd2x vs38, o96, T4 + stxvd2x vs39, o112, T4 + + .endm /********************************************************************* diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S new file mode 100644 index 000000000..31966047f --- /dev/null +++ b/kernel/power/dgemm_ncopy_4_power8.S @@ -0,0 +1,228 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define A0 r8 +#define A1 r9 +#define A2 r10 +#define A3 r11 + +#define J r12 + +#define PREA r14 +#define PREB r15 +#define BO r16 +#define o64 r17 +#define o80 r18 +#define o96 r19 +#define o112 r20 +#define o8 r21 +#define T2 r22 +#define I r23 +#define o16 r24 +#define o32 r25 +#define o48 r26 +#define NOTU1 r27 +#define NOTU2 r30 +#define T1 r31 + +#define o0 0 + +#include "dgemm_ncopy_macros_4_power8.S" + +#define STACKSIZE 384 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + cmpwi cr0, M, 0 + ble- L999 + cmpwi cr0, N, 0 + ble- L999 + + slwi LDA, LDA, BASE_SHIFT + + li PREA, 384 + li PREB, 384 + + li o8, 8 + li o16, 16 + li o32, 32 + li o48, 48 + li o64, 64 + li o80, 80 + li o96, 96 + li o112, 112 + +#include "dgemm_ncopy_logic_4_power8.S" + +L999: + + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + + 
ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + addi SP, SP, STACKSIZE + + blr + EPILOGUE + + diff --git a/kernel/power/dgemm_ncopy_logic_4_power8.S b/kernel/power/dgemm_ncopy_logic_4_power8.S new file mode 100644 index 000000000..6944a7818 --- /dev/null +++ b/kernel/power/dgemm_ncopy_logic_4_power8.S @@ -0,0 +1,237 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + mr BO, B + srawi. I, N, 2 + ble DCOPYN_L2_BEGIN + + +DCOPYN_L4_BEGIN: + + +DCOPYN_L4_LOOP: + + mr A0, A + add A1, A0, LDA + add A2, A1, LDA + add A3, A2, LDA + add A, A3, LDA + +DCOPYN_L4x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L4x16_END + +DCOPYN_L4x16_LOOP: + + dcbt A0, PREA + dcbt A1, PREA + dcbt A2, PREA + dcbt A3, PREA + COPY_4x16 + addic. J, J, -1 + bgt DCOPYN_L4x16_LOOP + +DCOPYN_L4x16_END: + + +DCOPYN_L4x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L4x8_END + COPY_4x8 + +DCOPYN_L4x8_END: + + +DCOPYN_L4x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L4x4_END + COPY_4x4 + +DCOPYN_L4x4_END: + + +DCOPYN_L4x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L4x2_END + COPY_4x2 + +DCOPYN_L4x2_END: + + +DCOPYN_L4x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L4x1_END + COPY_4x1 + +DCOPYN_L4x1_END: + + +DCOPYN_L4_END: + + addic. I, I, -1 + bgt DCOPYN_L4_LOOP + +DCOPYN_L2_BEGIN: + + andi. 
T1, 4, 2 + ble DCOPYN_L2_END + +DCOPYN_L2_LOOP: + + mr A0, A + add A1, A0, LDA + add A, A1, LDA + +DCOPYN_L2x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L2x16_END + +DCOPYN_L2x16_LOOP: + + COPY_2x16 + addic. J, J, -1 + bgt DCOPYN_L2x16_LOOP + +DCOPYN_L2x16_END: + + +DCOPYN_L2x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L2x8_END + COPY_2x8 + +DCOPYN_L2x8_END: + + +DCOPYN_L2x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L2x4_END + COPY_2x4 + +DCOPYN_L2x4_END: + + +DCOPYN_L2x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L2x2_END + COPY_2x2 + +DCOPYN_L2x2_END: + + +DCOPYN_L2x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L2x1_END + COPY_2x1 + +DCOPYN_L2x1_END: + + +DCOPYN_L2_END: + + +DCOPYN_L1_BEGIN: + + andi. T1, 4, 1 + ble DCOPYN_L1_END + +DCOPYN_L1_LOOP: + + mr A0, A + add A, A0, LDA + +DCOPYN_L1x16_BEGIN: + + srawi. J, M, 4 + ble DCOPYN_L1x16_END + +DCOPYN_L1x16_LOOP: + + COPY_1x16 + addic. J, J, -1 + bgt DCOPYN_L1x16_LOOP + +DCOPYN_L1x16_END: + + +DCOPYN_L1x8_BEGIN: + + andi. J, M, 8 + ble DCOPYN_L1x8_END + COPY_1x8 + +DCOPYN_L1x8_END: + + +DCOPYN_L1x4_BEGIN: + + andi. J, M, 4 + ble DCOPYN_L1x4_END + COPY_1x4 + +DCOPYN_L1x4_END: + + +DCOPYN_L1x2_BEGIN: + + andi. J, M, 2 + ble DCOPYN_L1x2_END + COPY_1x2 + +DCOPYN_L1x2_END: + + +DCOPYN_L1x1_BEGIN: + + andi. J, M, 1 + ble DCOPYN_L1x1_END + COPY_1x1 + +DCOPYN_L1x1_END: + + +DCOPYN_L1_END: + diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S new file mode 100644 index 000000000..9b07d73f5 --- /dev/null +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -0,0 +1,691 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro COPY_4x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs8, o0, A1 + lxvd2x vs24, o0, A3 + lxvd2x vs16, o0, A2 + + lxvd2x vs1, o16, A0 + lxvd2x vs9, o16, A1 + lxvd2x vs17, o16, A2 + lxvd2x vs25, o16, A3 + + lxvd2x vs2, o32, A0 + lxvd2x vs10, o32, A1 + lxvd2x vs18, o32, A2 + lxvd2x vs26, o32, A3 + + lxvd2x vs3, o48, A0 + lxvd2x vs11, o48, A1 + lxvd2x vs19, o48, A2 + lxvd2x vs27, o48, A3 + + lxvd2x vs4, o64, A0 + lxvd2x vs12, o64, A1 + lxvd2x vs20, o64, A2 + lxvd2x vs28, o64, A3 + + lxvd2x vs5, o80, A0 + lxvd2x vs13, o80, A1 + lxvd2x vs21, o80, A2 + lxvd2x vs29, o80, A3 + + lxvd2x vs6, o96, A0 + lxvd2x vs14, o96, A1 + lxvd2x vs22, o96, A2 + lxvd2x vs30, o96, A3 + + lxvd2x vs7, o112, A0 + lxvd2x vs15, o112, A1 + lxvd2x vs23, o112, A2 + lxvd2x vs31, o112, A3 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + xxpermdi vs48, vs4, vs12, 0 + xxpermdi vs49, vs20, vs28, 0 + xxpermdi vs50, vs4, vs12, 3 + xxpermdi vs51, vs20, vs28, 3 + + xxpermdi vs52, vs5, vs13, 0 + xxpermdi vs53, vs21, vs29, 0 + xxpermdi vs54, vs5, vs13, 3 + xxpermdi vs55, vs21, vs29, 3 + + addi A0, A0, 128 + addi A1, A1, 128 + + xxpermdi vs56, vs6, vs14, 0 + xxpermdi vs57, vs22, vs30, 0 + xxpermdi vs58, vs6, vs14, 3 + xxpermdi vs59, vs22, vs30, 3 + + addi A3, A3, 128 + addi A2, A2, 128 + + xxpermdi vs60, vs7, vs15, 0 + xxpermdi vs61, vs23, vs31, 0 + xxpermdi vs62, vs7, vs15, 3 + xxpermdi vs63, vs23, vs31, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + stxvd2x vs48, o0, BO + stxvd2x vs49, o16, BO + stxvd2x vs50, o32, BO + stxvd2x vs51, o48, BO + stxvd2x vs52, o64, BO + stxvd2x vs53, o80, BO + stxvd2x vs54, o96, BO + stxvd2x vs55, o112, BO + addi BO, BO, 128 + + stxvd2x vs56, o0, BO + stxvd2x vs57, o16, BO + stxvd2x vs58, o32, BO + stxvd2x vs59, o48, BO + stxvd2x vs60, o64, BO + stxvd2x vs61, o80, BO + stxvd2x vs62, o96, BO + stxvd2x vs63, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro COPY_4x8 + + lxvd2x 
vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + lxvd2x vs18, o32, A2 + lxvd2x vs19, o48, A2 + addi A2, A2, 64 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + lxvd2x vs26, o32, A3 + lxvd2x vs27, o48, A3 + addi A3, A3, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + xxpermdi vs40, vs2, vs10, 0 + xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs42, vs2, vs10, 3 + xxpermdi vs43, vs18, vs26, 3 + + xxpermdi vs44, vs3, vs11, 0 + xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs46, vs3, vs11, 3 + xxpermdi vs47, vs19, vs27, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro COPY_4x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + lxvd2x vs16, o0, A2 + lxvd2x vs17, o16, A2 + addi A2, A2, 32 + + + lxvd2x vs24, o0, A3 + lxvd2x vs25, o16, A3 + addi A3, A3, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + xxpermdi vs36, vs1, vs9, 0 + xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs38, vs1, vs9, 3 + xxpermdi vs39, vs17, vs25, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro COPY_4x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + lxvd2x vs16, o0, A2 + addi A2, A2, 16 + + + lxvd2x vs24, o0, A3 + addi A3, A3, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs34, vs0, vs8, 3 + xxpermdi vs35, vs16, vs24, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro COPY_4x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + lxsdx vs16, o0, A2 + addi A2, A2, 8 + + + lxsdx vs24, o0, A3 + addi A3, A3, 8 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs16, vs24, 0 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + 
+.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro COPY_2x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + lxvd2x vs12, o64, A1 + lxvd2x vs13, o80, A1 + lxvd2x vs14, o96, A1 + lxvd2x vs15, o112, A1 + addi A1, A1, 128 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + xxpermdi vs40, vs4, vs12, 0 + xxpermdi vs41, vs4, vs12, 3 + + xxpermdi vs42, vs5, vs13, 0 + xxpermdi vs43, vs5, vs13, 3 + + xxpermdi vs44, vs6, vs14, 0 + xxpermdi vs45, vs6, vs14, 3 + + xxpermdi vs46, vs7, vs15, 0 + xxpermdi vs47, vs7, vs15, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + stxvd2x vs40, o0, BO + stxvd2x vs41, o16, BO + stxvd2x vs42, o32, BO + stxvd2x vs43, o48, BO + stxvd2x vs44, o64, BO + stxvd2x vs45, o80, BO + stxvd2x vs46, o96, BO + stxvd2x vs47, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro COPY_2x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + lxvd2x vs10, o32, A1 + lxvd2x vs11, o48, A1 + addi A1, A1, 64 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + xxpermdi vs36, vs2, vs10, 0 + xxpermdi vs37, vs2, vs10, 3 + + xxpermdi vs38, vs3, vs11, 0 + xxpermdi vs39, vs3, vs11, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + stxvd2x vs36, o64, BO + stxvd2x vs37, o80, BO + stxvd2x vs38, o96, BO + stxvd2x vs39, o112, BO + addi BO, BO, 128 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro COPY_2x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + lxvd2x vs8, o0, A1 + lxvd2x vs9, o16, A1 + addi A1, A1, 32 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + xxpermdi vs34, vs1, vs9, 0 + xxpermdi vs35, vs1, vs9, 3 + + + stxvd2x vs32, o0, BO + stxvd2x vs33, o16, BO + stxvd2x vs34, o32, BO + stxvd2x vs35, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro COPY_2x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + lxvd2x vs8, o0, A1 + addi A1, A1, 16 + + + xxpermdi vs32, vs0, vs8, 0 + xxpermdi vs33, vs0, vs8, 3 + + + stxvd2x vs32, o0, 
BO + stxvd2x vs33, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro COPY_2x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + lxsdx vs8, o0, A1 + addi A1, A1, 8 + + + xxpermdi vs32, vs0, vs8, 0 + + + stxvd2x vs32, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro COPY_1x16 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + lxvd2x vs4, o64, A0 + lxvd2x vs5, o80, A0 + lxvd2x vs6, o96, A0 + lxvd2x vs7, o112, A0 + addi A0, A0, 128 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + stxvd2x vs4, o0, BO + stxvd2x vs5, o16, BO + stxvd2x vs6, o32, BO + stxvd2x vs7, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro COPY_1x8 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + lxvd2x vs2, o32, A0 + lxvd2x vs3, o48, A0 + addi A0, A0, 64 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + stxvd2x vs2, o32, BO + stxvd2x vs3, o48, BO + addi BO, BO, 64 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro COPY_1x4 + + lxvd2x vs0, o0, A0 + lxvd2x vs1, o16, A0 + addi A0, A0, 32 + + + stxvd2x vs0, o0, BO + stxvd2x vs1, o16, BO + addi BO, BO, 32 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro COPY_1x2 + + lxvd2x vs0, o0, A0 + addi A0, A0, 16 + + + stxvd2x vs0, o0, BO + addi BO, BO, 16 + + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro COPY_1x1 + + lxsdx vs0, o0, A0 + addi A0, A0, 8 + + + stxsdx vs0, o0, BO + addi BO, BO, 8 + + +.endm +
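
For orientation, below is a minimal C sketch (not part of the patch) of the packing order that the new dgemm_ncopy_4_power8.S produces with its lxvd2x / xxpermdi / stxvd2x sequences, i.e. the same layout as the generic ../generic/gemm_ncopy_4.c it replaces in KERNEL.POWER8: four LDA-strided source columns are interleaved row by row into the contiguous output buffer, followed by a two-column and a one-column tail. The function name, parameter names, and types are illustrative only.

```c
#include <stddef.h>

/* Reference layout of the packed buffer produced by the COPY_4x*,
 * COPY_2x*, and COPY_1x* macros: columns are taken four (then two,
 * then one) at a time and their rows are interleaved into b. */
static void dgemm_ncopy_4_ref(size_t m, size_t n, const double *a,
                              size_t lda, double *b)
{
    size_t j = 0;

    for (; j + 4 <= n; j += 4) {            /* N loop: 4 columns per panel */
        const double *a0 = a + (j + 0) * lda;
        const double *a1 = a + (j + 1) * lda;
        const double *a2 = a + (j + 2) * lda;
        const double *a3 = a + (j + 3) * lda;
        for (size_t i = 0; i < m; i++) {    /* interleave one row at a time */
            *b++ = a0[i];
            *b++ = a1[i];
            *b++ = a2[i];
            *b++ = a3[i];
        }
    }

    if (n & 2) {                            /* two-column tail (DCOPYN_L2) */
        const double *a0 = a + j * lda;
        const double *a1 = a0 + lda;
        for (size_t i = 0; i < m; i++) {
            *b++ = a0[i];
            *b++ = a1[i];
        }
        j += 2;
    }

    if (n & 1) {                            /* one-column tail (DCOPYN_L1) */
        const double *a0 = a + j * lda;
        for (size_t i = 0; i < m; i++)
            *b++ = a0[i];
    }
}
```

The consumer side follows the same reorganization: the new KERNEL4x16_L1/KERNEL4x16_L2 pair streams A through the o64..o112 offset registers (128 bytes of A per macro) and ping-pongs between the vs0-vs7 and vs8-vs15 load sets, which is why LDGEMM_L4x16 now unrolls K by two (srawi. L, K, 1 / andi. L, K, 1) instead of four.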