diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 0b6a7f3b8..fb07ccffd 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = gemm_ncopy_4.S -DGEMMOTCOPY = gemm_tcopy_4.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 4c14b0c6f..bcc6ce328 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#define T4 r12 +#define T3 r11 + #define o8 r15 #define o24 r16 #define ALPHA r17 @@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi ALPHA, SP, 224 #endif - li PRE, 256 + li PRE, 384 li o8 , 8 li o16, 16 li o24, 24 diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 49c438f61..4ad3387e8 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -35,160 +35,154 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srawi. J, N, 2 - ble .LDGEMM_L4_END + ble LDGEMM_L4_END -.LDGEMM_L4_BEGIN: +LDGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L4x16_END + ble LDGEMM_L4x16_END -.LDGEMM_L4x16_BEGIN: + .align 5 +LDGEMM_L4x16_BEGIN: + li T4, -128 + + and T1, CO, T4 + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + andi. cr0, CO, 127 + ble LDGEMM_L4x16_BEGIN_NOPRE + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + +LDGEMM_L4x16_BEGIN_NOPRE: mr BO, B - srawi. L, K, 3 - ble .LDGEMM_L4x16_SUB0 + srawi. L, K, 2 + ble LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x16_SUB4 + ble LDGEMM_L4x16_SUB4 -.LDGEMM_L4x16_LOOP_START: + .align 5 +LDGEMM_L4x16_LOOP_START: - dcbt AO, PRE + dcbt AO, PRE LOAD4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_I1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 addic. L, L, -2 - ble .LDGEMM_L4x16_LOOP_END + ble LDGEMM_L4x16_LOOP_END - .align 5 + .align 7 -.LDGEMM_L4x16_LOOP: +LDGEMM_L4x16_LOOP: - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 addic. 
L, L, -1 - bgt .LDGEMM_L4x16_LOOP + bgt LDGEMM_L4x16_LOOP -.LDGEMM_L4x16_LOOP_END: + .align 5 +LDGEMM_L4x16_LOOP_END: - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_1 - dcbt AO, PRE + dcbt AO, PRE KERNEL4x16_2 - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - - dcbt AO, PRE - KERNEL4x16_1 - dcbt AO, PRE - KERNEL4x16_2 - dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1 -.LDGEMM_L4x16_SUB4: +LDGEMM_L4x16_SUB4: - dcbt AO, PRE KERNEL4x16_SUBI1 - dcbt AO, PRE - KERNEL4x16_SUB1 - dcbt AO, PRE - KERNEL4x16_SUB1 - dcbt AO, PRE - KERNEL4x16_SUB1 - - KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b .LDGEMM_L4x16_SUB1 + b LDGEMM_L4x16_SUB1 -.LDGEMM_L4x16_SUB0: +LDGEMM_L4x16_SUB0: - andi. L, K, 7 + andi. L, K, 3 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x16_SAVE - b .LDGEMM_L4x16_SUB2 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SUB1: +LDGEMM_L4x16_SUB1: - andi. L, K, 7 - ble .LDGEMM_L4x16_SAVE + andi. L, K, 3 + ble LDGEMM_L4x16_SAVE -.LDGEMM_L4x16_SUB2: +LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x16_SUB2 + bgt LDGEMM_L4x16_SUB2 -.LDGEMM_L4x16_SAVE: + .align 5 +LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LDGEMM_L4x16_BEGIN + bgt LDGEMM_L4x16_BEGIN -.LDGEMM_L4x16_END: +LDGEMM_L4x16_END: -.LDGEMM_L4x8_BEGIN: +LDGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END andi. T1, M, 8 - ble .LDGEMM_L4x8_END + ble LDGEMM_L4x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x8_SUB0 + ble LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x8_SUB4 + ble LDGEMM_L4x8_SUB4 -.LDGEMM_L4x8_LOOP_START: +LDGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -202,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LDGEMM_L4x8_LOOP_END + ble LDGEMM_L4x8_LOOP_END .align 5 -.LDGEMM_L4x8_LOOP: +LDGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -219,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LDGEMM_L4x8_LOOP + bgt LDGEMM_L4x8_LOOP -.LDGEMM_L4x8_LOOP_END: +LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -233,9 +227,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB4: +LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -247,48 +241,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LDGEMM_L4x8_SUB1 + b LDGEMM_L4x8_SUB1 -.LDGEMM_L4x8_SUB0: +LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x8_SAVE - b .LDGEMM_L4x8_SUB2 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SUB1: +LDGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x8_SAVE + ble LDGEMM_L4x8_SAVE -.LDGEMM_L4x8_SUB2: +LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x8_SUB2 + bgt LDGEMM_L4x8_SUB2 -.LDGEMM_L4x8_SAVE: +LDGEMM_L4x8_SAVE: SAVE4x8 -.LDGEMM_L4x8_END: +LDGEMM_L4x8_END: -.LDGEMM_L4x4_BEGIN: +LDGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L4x4_END + ble LDGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x4_SUB0 + ble LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x4_SUB4 + ble LDGEMM_L4x4_SUB4 -.LDGEMM_L4x4_LOOP_START: +LDGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -302,11 +296,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. 
L, L, -2 - ble .LDGEMM_L4x4_LOOP_END + ble LDGEMM_L4x4_LOOP_END .align 5 -.LDGEMM_L4x4_LOOP: +LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -319,9 +313,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LDGEMM_L4x4_LOOP + bgt LDGEMM_L4x4_LOOP -.LDGEMM_L4x4_LOOP_END: +LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -333,9 +327,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB4: +LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -347,48 +341,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LDGEMM_L4x4_SUB1 + b LDGEMM_L4x4_SUB1 -.LDGEMM_L4x4_SUB0: +LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x4_SAVE - b .LDGEMM_L4x4_SUB2 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SUB1: +LDGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x4_SAVE + ble LDGEMM_L4x4_SAVE -.LDGEMM_L4x4_SUB2: +LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x4_SUB2 + bgt LDGEMM_L4x4_SUB2 -.LDGEMM_L4x4_SAVE: +LDGEMM_L4x4_SAVE: SAVE4x4 -.LDGEMM_L4x4_END: +LDGEMM_L4x4_END: -.LDGEMM_L4x2_BEGIN: +LDGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L4x2_END + ble LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x2_SUB0 + ble LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x2_SUB4 + ble LDGEMM_L4x2_SUB4 -.LDGEMM_L4x2_LOOP_START: +LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -402,11 +396,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LDGEMM_L4x2_LOOP_END + ble LDGEMM_L4x2_LOOP_END .align 5 -.LDGEMM_L4x2_LOOP: +LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -419,9 +413,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LDGEMM_L4x2_LOOP + bgt LDGEMM_L4x2_LOOP -.LDGEMM_L4x2_LOOP_END: +LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -433,9 +427,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB4: +LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -447,48 +441,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LDGEMM_L4x2_SUB1 + b LDGEMM_L4x2_SUB1 -.LDGEMM_L4x2_SUB0: +LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x2_SAVE - b .LDGEMM_L4x2_SUB2 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SUB1: +LDGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x2_SAVE + ble LDGEMM_L4x2_SAVE -.LDGEMM_L4x2_SUB2: +LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x2_SUB2 + bgt LDGEMM_L4x2_SUB2 -.LDGEMM_L4x2_SAVE: +LDGEMM_L4x2_SAVE: SAVE4x2 -.LDGEMM_L4x2_END: +LDGEMM_L4x2_END: -.LDGEMM_L4x1_BEGIN: +LDGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L4x1_END + ble LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L4x1_SUB0 + ble LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L4x1_SUB4 + ble LDGEMM_L4x1_SUB4 -.LDGEMM_L4x1_LOOP_START: +LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -502,11 +496,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. 
L, L, -2 - ble .LDGEMM_L4x1_LOOP_END + ble LDGEMM_L4x1_LOOP_END .align 5 -.LDGEMM_L4x1_LOOP: +LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -519,9 +513,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LDGEMM_L4x1_LOOP + bgt LDGEMM_L4x1_LOOP -.LDGEMM_L4x1_LOOP_END: +LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -533,9 +527,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB4: +LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -547,74 +541,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LDGEMM_L4x1_SUB1 + b LDGEMM_L4x1_SUB1 -.LDGEMM_L4x1_SUB0: +LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L4x1_SAVE - b .LDGEMM_L4x1_SUB2 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SUB1: +LDGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L4x1_SAVE + ble LDGEMM_L4x1_SAVE -.LDGEMM_L4x1_SUB2: +LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L4x1_SUB2 + bgt LDGEMM_L4x1_SUB2 -.LDGEMM_L4x1_SAVE: +LDGEMM_L4x1_SAVE: SAVE4x1 -.LDGEMM_L4x1_END: +LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LDGEMM_L4_BEGIN + bgt LDGEMM_L4_BEGIN andi. T2, N, 3 ble .L999 -.LDGEMM_L4_END: +LDGEMM_L4_END: - b .LDGEMM_L2_BEGIN + b LDGEMM_L2_BEGIN .L999_H1: b .L999 -.LDGEMM_L2_BEGIN: +LDGEMM_L2_BEGIN: andi. T1, N, 2 - ble .LDGEMM_L2_END + ble LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LDGEMM_L2x16_END + ble LDGEMM_L2x16_END -.LDGEMM_L2x16_BEGIN: +LDGEMM_L2x16_BEGIN: mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x16_SUB0 + ble LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x16_SUB4 + ble LDGEMM_L2x16_SUB4 -.LDGEMM_L2x16_LOOP_START: +LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -637,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LDGEMM_L2x16_LOOP_END + ble LDGEMM_L2x16_LOOP_END .align 5 -.LDGEMM_L2x16_LOOP: +LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -662,9 +656,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LDGEMM_L2x16_LOOP + bgt LDGEMM_L2x16_LOOP -.LDGEMM_L2x16_LOOP_END: +LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -683,9 +677,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB4: +LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -701,53 +695,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LDGEMM_L2x16_SUB1 + b LDGEMM_L2x16_SUB1 -.LDGEMM_L2x16_SUB0: +LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x16_SAVE - b .LDGEMM_L2x16_SUB2 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SUB1: +LDGEMM_L2x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x16_SAVE + ble LDGEMM_L2x16_SAVE -.LDGEMM_L2x16_SUB2: +LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x16_SUB2 + bgt LDGEMM_L2x16_SUB2 -.LDGEMM_L2x16_SAVE: +LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LDGEMM_L2x16_BEGIN + bgt LDGEMM_L2x16_BEGIN -.LDGEMM_L2x16_END: +LDGEMM_L2x16_END: -.LDGEMM_L2x8_BEGIN: +LDGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END andi. 
T1, M, 8 - ble .LDGEMM_L2x8_END + ble LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x8_SUB0 + ble LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x8_SUB4 + ble LDGEMM_L2x8_SUB4 -.LDGEMM_L2x8_LOOP_START: +LDGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -761,11 +755,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LDGEMM_L2x8_LOOP_END + ble LDGEMM_L2x8_LOOP_END .align 5 -.LDGEMM_L2x8_LOOP: +LDGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -778,9 +772,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LDGEMM_L2x8_LOOP + bgt LDGEMM_L2x8_LOOP -.LDGEMM_L2x8_LOOP_END: +LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -792,9 +786,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB4: +LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -806,48 +800,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LDGEMM_L2x8_SUB1 + b LDGEMM_L2x8_SUB1 -.LDGEMM_L2x8_SUB0: +LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x8_SAVE - b .LDGEMM_L2x8_SUB2 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SUB1: +LDGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x8_SAVE + ble LDGEMM_L2x8_SAVE -.LDGEMM_L2x8_SUB2: +LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x8_SUB2 + bgt LDGEMM_L2x8_SUB2 -.LDGEMM_L2x8_SAVE: +LDGEMM_L2x8_SAVE: SAVE2x8 -.LDGEMM_L2x8_END: +LDGEMM_L2x8_END: -.LDGEMM_L2x4_BEGIN: +LDGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L2x4_END + ble LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x4_SUB0 + ble LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x4_SUB4 + ble LDGEMM_L2x4_SUB4 -.LDGEMM_L2x4_LOOP_START: +LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -861,11 +855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LDGEMM_L2x4_LOOP_END + ble LDGEMM_L2x4_LOOP_END .align 5 -.LDGEMM_L2x4_LOOP: +LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -878,9 +872,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LDGEMM_L2x4_LOOP + bgt LDGEMM_L2x4_LOOP -.LDGEMM_L2x4_LOOP_END: +LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -892,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB4: +LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -906,48 +900,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LDGEMM_L2x4_SUB1 + b LDGEMM_L2x4_SUB1 -.LDGEMM_L2x4_SUB0: +LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x4_SAVE - b .LDGEMM_L2x4_SUB2 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SUB1: +LDGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x4_SAVE + ble LDGEMM_L2x4_SAVE -.LDGEMM_L2x4_SUB2: +LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x4_SUB2 + bgt LDGEMM_L2x4_SUB2 -.LDGEMM_L2x4_SAVE: +LDGEMM_L2x4_SAVE: SAVE2x4 -.LDGEMM_L2x4_END: +LDGEMM_L2x4_END: -.LDGEMM_L2x2_BEGIN: +LDGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L2x2_END + ble LDGEMM_L2x2_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L2x2_SUB0 + ble LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x2_SUB4 + ble LDGEMM_L2x2_SUB4 -.LDGEMM_L2x2_LOOP_START: +LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -961,11 +955,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LDGEMM_L2x2_LOOP_END + ble LDGEMM_L2x2_LOOP_END .align 5 -.LDGEMM_L2x2_LOOP: +LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -978,9 +972,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LDGEMM_L2x2_LOOP + bgt LDGEMM_L2x2_LOOP -.LDGEMM_L2x2_LOOP_END: +LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -992,9 +986,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB4: +LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1006,48 +1000,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LDGEMM_L2x2_SUB1 + b LDGEMM_L2x2_SUB1 -.LDGEMM_L2x2_SUB0: +LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x2_SAVE - b .LDGEMM_L2x2_SUB2 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SUB1: +LDGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x2_SAVE + ble LDGEMM_L2x2_SAVE -.LDGEMM_L2x2_SUB2: +LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x2_SUB2 + bgt LDGEMM_L2x2_SUB2 -.LDGEMM_L2x2_SAVE: +LDGEMM_L2x2_SAVE: SAVE2x2 -.LDGEMM_L2x2_END: +LDGEMM_L2x2_END: -.LDGEMM_L2x1_BEGIN: +LDGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L2x1_END + ble LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L2x1_SUB0 + ble LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L2x1_SUB4 + ble LDGEMM_L2x1_SUB4 -.LDGEMM_L2x1_LOOP_START: +LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1061,11 +1055,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LDGEMM_L2x1_LOOP_END + ble LDGEMM_L2x1_LOOP_END .align 5 -.LDGEMM_L2x1_LOOP: +LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1078,9 +1072,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LDGEMM_L2x1_LOOP + bgt LDGEMM_L2x1_LOOP -.LDGEMM_L2x1_LOOP_END: +LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1092,9 +1086,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB4: +LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1106,59 +1100,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LDGEMM_L2x1_SUB1 + b LDGEMM_L2x1_SUB1 -.LDGEMM_L2x1_SUB0: +LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L2x1_SAVE - b .LDGEMM_L2x1_SUB2 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SUB1: +LDGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L2x1_SAVE + ble LDGEMM_L2x1_SAVE -.LDGEMM_L2x1_SUB2: +LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L2x1_SUB2 + bgt LDGEMM_L2x1_SUB2 -.LDGEMM_L2x1_SAVE: +LDGEMM_L2x1_SAVE: SAVE2x1 -.LDGEMM_L2x1_END: +LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LDGEMM_L2_END: -.LDGEMM_L1_BEGIN: +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LDGEMM_L1_END + ble LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LDGEMM_L1x16_END + ble LDGEMM_L1x16_END -.LDGEMM_L1x16_BEGIN: +LDGEMM_L1x16_BEGIN: mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x16_SUB0 + ble LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x16_SUB4 + ble LDGEMM_L1x16_SUB4 -.LDGEMM_L1x16_LOOP_START: +LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1181,11 +1175,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LDGEMM_L1x16_LOOP_END + ble LDGEMM_L1x16_LOOP_END .align 5 -.LDGEMM_L1x16_LOOP: +LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -1206,9 +1200,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LDGEMM_L1x16_LOOP + bgt LDGEMM_L1x16_LOOP -.LDGEMM_L1x16_LOOP_END: +LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -1227,9 +1221,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB4: +LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1245,53 +1239,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LDGEMM_L1x16_SUB1 + b LDGEMM_L1x16_SUB1 -.LDGEMM_L1x16_SUB0: +LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x16_SAVE - b .LDGEMM_L1x16_SUB2 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SUB1: +LDGEMM_L1x16_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x16_SAVE + ble LDGEMM_L1x16_SAVE -.LDGEMM_L1x16_SUB2: +LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x16_SUB2 + bgt LDGEMM_L1x16_SUB2 -.LDGEMM_L1x16_SAVE: +LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LDGEMM_L1x16_BEGIN + bgt LDGEMM_L1x16_BEGIN -.LDGEMM_L1x16_END: +LDGEMM_L1x16_END: -.LDGEMM_L1x8_BEGIN: +LDGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END andi. T1, M, 8 - ble .LDGEMM_L1x8_END + ble LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x8_SUB0 + ble LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x8_SUB4 + ble LDGEMM_L1x8_SUB4 -.LDGEMM_L1x8_LOOP_START: +LDGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1305,11 +1299,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LDGEMM_L1x8_LOOP_END + ble LDGEMM_L1x8_LOOP_END .align 5 -.LDGEMM_L1x8_LOOP: +LDGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1322,9 +1316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LDGEMM_L1x8_LOOP + bgt LDGEMM_L1x8_LOOP -.LDGEMM_L1x8_LOOP_END: +LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1336,9 +1330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB4: +LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1350,48 +1344,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LDGEMM_L1x8_SUB1 + b LDGEMM_L1x8_SUB1 -.LDGEMM_L1x8_SUB0: +LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x8_SAVE - b .LDGEMM_L1x8_SUB2 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SUB1: +LDGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x8_SAVE + ble LDGEMM_L1x8_SAVE -.LDGEMM_L1x8_SUB2: +LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x8_SUB2 + bgt LDGEMM_L1x8_SUB2 -.LDGEMM_L1x8_SAVE: +LDGEMM_L1x8_SAVE: SAVE1x8 -.LDGEMM_L1x8_END: +LDGEMM_L1x8_END: -.LDGEMM_L1x4_BEGIN: +LDGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LDGEMM_L1x4_END + ble LDGEMM_L1x4_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x4_SUB0 + ble LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x4_SUB4 + ble LDGEMM_L1x4_SUB4 -.LDGEMM_L1x4_LOOP_START: +LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1405,11 +1399,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LDGEMM_L1x4_LOOP_END + ble LDGEMM_L1x4_LOOP_END .align 5 -.LDGEMM_L1x4_LOOP: +LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1422,9 +1416,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LDGEMM_L1x4_LOOP + bgt LDGEMM_L1x4_LOOP -.LDGEMM_L1x4_LOOP_END: +LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1436,9 +1430,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB4: +LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1450,48 +1444,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LDGEMM_L1x4_SUB1 + b LDGEMM_L1x4_SUB1 -.LDGEMM_L1x4_SUB0: +LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x4_SAVE - b .LDGEMM_L1x4_SUB2 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SUB1: +LDGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x4_SAVE + ble LDGEMM_L1x4_SAVE -.LDGEMM_L1x4_SUB2: +LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x4_SUB2 + bgt LDGEMM_L1x4_SUB2 -.LDGEMM_L1x4_SAVE: +LDGEMM_L1x4_SAVE: SAVE1x4 -.LDGEMM_L1x4_END: +LDGEMM_L1x4_END: -.LDGEMM_L1x2_BEGIN: +LDGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LDGEMM_L1x2_END + ble LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble .LDGEMM_L1x2_SUB0 + ble LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x2_SUB4 + ble LDGEMM_L1x2_SUB4 -.LDGEMM_L1x2_LOOP_START: +LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1505,11 +1499,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LDGEMM_L1x2_LOOP_END + ble LDGEMM_L1x2_LOOP_END .align 5 -.LDGEMM_L1x2_LOOP: +LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1522,9 +1516,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LDGEMM_L1x2_LOOP + bgt LDGEMM_L1x2_LOOP -.LDGEMM_L1x2_LOOP_END: +LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1536,9 +1530,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB4: +LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1550,48 +1544,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LDGEMM_L1x2_SUB1 + b LDGEMM_L1x2_SUB1 -.LDGEMM_L1x2_SUB0: +LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x2_SAVE - b .LDGEMM_L1x2_SUB2 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SUB1: +LDGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x2_SAVE + ble LDGEMM_L1x2_SAVE -.LDGEMM_L1x2_SUB2: +LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x2_SUB2 + bgt LDGEMM_L1x2_SUB2 -.LDGEMM_L1x2_SAVE: +LDGEMM_L1x2_SAVE: SAVE1x2 -.LDGEMM_L1x2_END: +LDGEMM_L1x2_END: -.LDGEMM_L1x1_BEGIN: +LDGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LDGEMM_L1x1_END + ble LDGEMM_L1x1_END mr BO, B srawi. 
L, K, 3 - ble .LDGEMM_L1x1_SUB0 + ble LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LDGEMM_L1x1_SUB4 + ble LDGEMM_L1x1_SUB4 -.LDGEMM_L1x1_LOOP_START: +LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1605,11 +1599,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LDGEMM_L1x1_LOOP_END + ble LDGEMM_L1x1_LOOP_END .align 5 -.LDGEMM_L1x1_LOOP: +LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1622,9 +1616,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LDGEMM_L1x1_LOOP + bgt LDGEMM_L1x1_LOOP -.LDGEMM_L1x1_LOOP_END: +LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1636,9 +1630,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB4: +LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1650,34 +1644,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LDGEMM_L1x1_SUB1 + b LDGEMM_L1x1_SUB1 -.LDGEMM_L1x1_SUB0: +LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LDGEMM_L1x1_SAVE - b .LDGEMM_L1x1_SUB2 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SUB1: +LDGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LDGEMM_L1x1_SAVE + ble LDGEMM_L1x1_SAVE -.LDGEMM_L1x1_SUB2: +LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LDGEMM_L1x1_SUB2 + bgt LDGEMM_L1x1_SUB2 -.LDGEMM_L1x1_SAVE: +LDGEMM_L1x1_SAVE: SAVE1x1 -.LDGEMM_L1x1_END: +LDGEMM_L1x1_END: -.LDGEMM_L1_END: +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 27c05e08e..36531fbe9 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, CO addi T2, T1, 64 + add T3, T1, LDC + addi T4, T3, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 @@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 + + lxvd2x vs8, 0, T3 + lxvd2x vs9, o16, T3 + lxvd2x vs10, o32, T3 + lxvd2x vs11, o48, T3 + + lxvd2x vs12, 0, T4 + lxvd2x vs13, o16, T4 + lxvd2x vs14, o32, T4 + lxvd2x vs15, o48, T4 #endif #ifndef TRMMKERNEL @@ -453,45 +465,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - dcbt T1, PRE - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r @@ -501,6 +474,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r #else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r @@ -511,20 +492,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs15, vs47, alpha_r #endif - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 - dcbt T1, PRE + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 + stxvd2x vs8, 0, T3 + stxvd2x vs9, o16, T3 + stxvd2x vs10, o32, T3 + stxvd2x vs11, o48, T3 - add T1, T1, LDC - add T2, T2, LDC + stxvd2x vs12, 0, T4 + stxvd2x vs13, o16, T4 + stxvd2x vs14, o32, T4 + stxvd2x vs15, o48, T4 + + slwi T4, LDC, 1 + add T1, T1, T4 + add T3, T3, T4 + addi T2, T1, 64 + addi T4, T3, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 @@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 + + lxvd2x vs8, 0, T3 + lxvd2x vs9, o16, T3 + lxvd2x vs10, o32, T3 + lxvd2x vs11, o48, T3 + + lxvd2x vs12, 0, T4 + lxvd2x vs13, o16, T4 + lxvd2x vs14, o32, T4 + lxvd2x vs15, o48, T4 #endif #ifndef TRMMKERNEL @@ -547,45 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs5, vs53, alpha_r xvmaddadp vs6, vs54, alpha_r xvmaddadp vs7, vs55, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r - xvmuldp vs2, vs50, alpha_r - xvmuldp vs3, vs51, alpha_r - xvmuldp vs4, vs52, alpha_r - xvmuldp vs5, vs53, alpha_r - xvmuldp vs6, vs54, alpha_r - xvmuldp vs7, vs55, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - dcbt T1, PRE - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r xvmaddadp vs10, vs58, alpha_r @@ -595,6 +558,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs14, vs62, alpha_r xvmaddadp vs15, vs63, alpha_r #else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r xvmuldp vs10, vs58, alpha_r @@ -605,17 +576,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs15, vs63, alpha_r #endif - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 - dcbt T1, PRE + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 + stxvd2x vs8, 0, T3 + stxvd2x vs9, o16, T3 + stxvd2x vs10, o32, T3 + stxvd2x vs11, o48, T3 + + stxvd2x vs12, 0, T4 + stxvd2x vs13, o16, T4 + stxvd2x vs14, o32, T4 + stxvd2x vs15, o48, T4 addi CO, CO, 128 diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index f87af535d..eca78bac4 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add B2, B2, B add B1, B1, B - li PREA, 768 + li PREA, 256 addi PREB, M16, 128 li o8, 8 diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S index 776cd3401..28fc74793 100644 --- a/kernel/power/dgemm_tcopy_logic_16_power8.S +++ b/kernel/power/dgemm_tcopy_logic_16_power8.S @@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN: DCOPYT_L4x16_LOOP: +/* addi T1, PREB, 128 addi T2, PREB, 256 +*/ dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA +/* dcbtst BO, M16 dcbtst BO, PREB dcbtst BO, T1 dcbtst BO, T2 +*/ COPY_4x16 add BO, BO, M16 diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 2294128a2..e9dbd991e 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "dgemm_macros_16x4_power8.S" +#include "dtrmm_macros_16x4_power8.S" #ifndef NEEDPARAM diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S new file mode 100644 index 000000000..079144a90 --- /dev/null +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -0,0 +1,3431 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ + +.macro LOAD4x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + addi AO, AO, 64 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 
+ + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + xvmaddadp vs60, vs12, vs31 + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + xvmaddadp vs63, vs15, vs31 + +.endm + +.macro KERNEL4x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp 
vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.endm + +.macro KERNEL4x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endm + +.macro SAVE4x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 
0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r + xvmaddadp vs4, vs52, alpha_r + xvmaddadp vs5, vs53, alpha_r + xvmaddadp vs6, vs54, alpha_r + xvmaddadp vs7, vs55, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r + xvmuldp vs4, vs52, alpha_r + xvmuldp vs5, vs53, alpha_r + xvmuldp vs6, vs54, alpha_r + xvmuldp vs7, vs55, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r + xvmaddadp vs12, vs60, alpha_r + xvmaddadp vs13, vs61, alpha_r + xvmaddadp vs14, vs62, alpha_r + xvmaddadp vs15, vs63, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r + xvmuldp vs12, vs60, alpha_r + xvmuldp vs13, vs61, alpha_r + xvmuldp vs14, vs62, alpha_r + xvmuldp vs15, vs63, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_1 + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + xvmaddadp vs56, vs0, vs27 
+ xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_2 + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + +.endm + +.macro KERNEL4x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.endm + +.macro KERNEL4x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 64 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp 
vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r + xvmaddadp vs2, vs50, alpha_r + xvmaddadp vs3, vs51, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r + xvmuldp vs2, vs50, alpha_r + xvmuldp vs3, vs51, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r + xvmaddadp vs10, vs58, alpha_r + xvmaddadp vs11, vs59, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r + xvmuldp vs10, vs58, alpha_r + xvmuldp vs11, vs59, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, 
vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO 
+ lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + 
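+	/* fourth and last column of the N=4, M=1 tile: vs56 holds its accumulator */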
add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + 
lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
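+* (unrolled for 2 columns of B and 8 rows of A: the A loads alternate       *
+*  between vs0-vs3 and vs8-vs11, the broadcast B values between vs24-vs25   *
+*  and vs28-vs29, and the accumulators live in vs32-vs35 / vs40-vs43)       *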
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
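+* (unrolled for 1 column of B and 8 rows of A: the A loads alternate        *
+*  between vs0-vs3 and vs8-vs11, the broadcast B value between vs24 and     *
+*  vs28, and the accumulators live in vs32-vs35)                            *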
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + diff --git a/param.h b/param.h index aa09f6d61..e693755fa 100644 --- a/param.h +++ b/param.h @@ -410,7 +410,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(STEAMROLLER) || defined(EXCAVATOR) +#ifdef STEAMROLLER +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + + +#ifdef EXCAVATOR #define SNUMOPT 8 #define DNUMOPT 4 @@ -1885,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 1280 -#define DGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 768 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640
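The SAVE* macros above come in two flavours selected by TRMMKERNEL: without it the existing C tile is loaded and updated as C = alpha * acc + C (xvmaddadp against the loaded values); with TRMMKERNEL defined the load is skipped and the tile is overwritten with alpha * acc (xvmuldp). The C sketch below restates what one KERNEL*_SUB1 step plus the matching SAVE* macro compute for a single MR x NR tile; the function name, the flat packed-panel layout, and the fixed 16x4 accumulator bound are assumptions made for illustration, not code from this patch.

    /* Reference semantics of KERNEL*_SUB1 + SAVE* for one MR x NR tile
     * (column-major C, MR <= 16, NR <= 4).  Illustrative sketch only.          */
    static void tile_update_ref(int mr, int nr, int kb, double alpha,
                                const double *a,   /* packed A panel, mr values per k step */
                                const double *b,   /* packed B panel, nr values per k step */
                                double *c, long ldc, int trmm)
    {
        double acc[16 * 4] = { 0.0 };

        for (int kk = 0; kk < kb; kk++)            /* one KERNEL*_SUB1 per kk   */
            for (int j = 0; j < nr; j++)
                for (int i = 0; i < mr; i++)
                    acc[i + j * mr] += a[i + kk * mr] * b[j + kk * nr];

        for (int j = 0; j < nr; j++)               /* SAVE*: fold alpha into C  */
            for (int i = 0; i < mr; i++)
                c[i + j * ldc] = trmm ? alpha * acc[i + j * mr]
                                      : alpha * acc[i + j * mr] + c[i + j * ldc];
    }

For the final param.h hunk, DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q are the level-3 cache-blocking sizes used by the GEMM driver (roughly, P blocks the M dimension and Q blocks the K dimension of the packed A panel), so bumping both from 640 to 768 enlarges the panel that each sweep of the 16x4 micro-kernel works through. The loop nest below shows where the two constants act; the loop order, the missing packing step, and all names are simplifications for illustration, not the OpenBLAS driver code.

    #include <stddef.h>

    enum { BLOCK_P = 768, BLOCK_Q = 768 };         /* mirrors DGEMM_DEFAULT_P / _Q */

    /* C (m x n) += alpha * A (m x k) * B (k x n), column-major reference. */
    static void dgemm_blocked_ref(size_t m, size_t n, size_t k, double alpha,
                                  const double *A, size_t lda,
                                  const double *B, size_t ldb,
                                  double *C, size_t ldc)
    {
        for (size_t ks = 0; ks < k; ks += BLOCK_Q) {       /* K blocked by GEMM_Q */
            size_t kb = (k - ks < BLOCK_Q) ? k - ks : BLOCK_Q;
            for (size_t is = 0; is < m; is += BLOCK_P) {   /* M blocked by GEMM_P */
                size_t ib = (m - is < BLOCK_P) ? m - is : BLOCK_P;
                /* In OpenBLAS this block is packed and handed to the 16x4
                 * micro-kernel; here it is a plain triple loop.                 */
                for (size_t j = 0; j < n; j++)
                    for (size_t kk = 0; kk < kb; kk++) {
                        double bv = alpha * B[(ks + kk) + j * ldb];
                        for (size_t i = 0; i < ib; i++)
                            C[(is + i) + j * ldc] += A[(is + i) + (ks + kk) * lda] * bv;
                    }
            }
        }
    }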