diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 031f342ad..c2dc1f651 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -128,17 +128,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 +#define alpha_vr vs31 #define o0 0 -#define TBUFFER r14 +#define BBUFFER r14 #define o4 r15 #define o12 r16 #define o8 r17 #define L r18 #define T1 r19 #define KK r20 -#define BB r21 +#define BBO r21 #define I r22 #define J r23 #define AO r24 @@ -256,11 +257,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 li PRE, 256 li o4 , 4 @@ -269,18 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o16, 16 li o32, 32 li o48, 48 - addi TBUFFER, SP, 320 + + li T1, 256 + slwi T1, T1, 9 // 131072 + sub BBUFFER, A, T1 // temp buffer for B unrolled addi T1, SP, 300 - stfs f1, 0(T1) + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 - lxsspx alpha_r, 0, T1 + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 #include "sgemm_logic_16x8_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 0ae6413ce..06bb79ea3 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -26,94 +26,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 3 - ble .LSGEMM_L8_END + ble SGEMM_L8_END -.LSGEMM_L8_BEGIN: +SGEMM_L8_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +SGEMM_L8_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L8_COPYB mr CO, C mr AO, A slwi T1, LDC , 3 add C, C, T1 srawi. 
I, M, 4 - ble .LSGEMM_L8x16_END + ble SGEMM_L8x16_END -.LSGEMM_L8x16_BEGIN: +SGEMM_L8x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x16_SUB0 + ble SGEMM_L8x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x16_SUB4 + ble SGEMM_L8x16_SUB4 -.LSGEMM_L8x16_LOOP_START: +SGEMM_L8x16_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD8x16_1 + dcbt BO, PRE KERNEL8x16_I1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 addic. L, L, -2 - ble .LSGEMM_L8x16_LOOP_END + ble SGEMM_L8x16_LOOP_END .align 5 -.LSGEMM_L8x16_LOOP: +SGEMM_L8x16_LOOP: + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 addic. L, L, -1 - bgt .LSGEMM_L8x16_LOOP + bgt SGEMM_L8x16_LOOP -.LSGEMM_L8x16_LOOP_END: +SGEMM_L8x16_LOOP_END: + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 KERNEL8x16_E2 - b .LSGEMM_L8x16_SUB1 + b SGEMM_L8x16_SUB1 -.LSGEMM_L8x16_SUB4: +SGEMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 @@ -127,53 +182,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_SUB1 KERNEL8x16_SUB1 - b .LSGEMM_L8x16_SUB1 + b SGEMM_L8x16_SUB1 -.LSGEMM_L8x16_SUB0: +SGEMM_L8x16_SUB0: andi. L, K, 7 KERNEL8x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x16_SAVE - b .LSGEMM_L8x16_SUB2 + ble SGEMM_L8x16_SAVE + b SGEMM_L8x16_SUB2 -.LSGEMM_L8x16_SUB1: +SGEMM_L8x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x16_SAVE + ble SGEMM_L8x16_SAVE -.LSGEMM_L8x16_SUB2: +SGEMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x16_SUB2 + bgt SGEMM_L8x16_SUB2 -.LSGEMM_L8x16_SAVE: +SGEMM_L8x16_SAVE: SAVE8x16 addic. I, I, -1 - bgt .LSGEMM_L8x16_BEGIN + bgt SGEMM_L8x16_BEGIN -.LSGEMM_L8x16_END: +SGEMM_L8x16_END: -.LSGEMM_L8x8_BEGIN: +SGEMM_L8x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L8x1_END + ble SGEMM_L8x1_END andi. T1, M, 8 - ble .LSGEMM_L8x8_END - mr BO, B + ble SGEMM_L8x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x8_SUB0 + ble SGEMM_L8x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x8_SUB4 + ble SGEMM_L8x8_SUB4 -.LSGEMM_L8x8_LOOP_START: +SGEMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 @@ -187,11 +242,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -2 - ble .LSGEMM_L8x8_LOOP_END + ble SGEMM_L8x8_LOOP_END .align 5 -.LSGEMM_L8x8_LOOP: +SGEMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 @@ -204,9 +259,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -1 - bgt .LSGEMM_L8x8_LOOP + bgt SGEMM_L8x8_LOOP -.LSGEMM_L8x8_LOOP_END: +SGEMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 @@ -218,9 +273,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_1 KERNEL8x8_E2 - b .LSGEMM_L8x8_SUB1 + b SGEMM_L8x8_SUB1 -.LSGEMM_L8x8_SUB4: +SGEMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 @@ -232,48 +287,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_SUB1 KERNEL8x8_SUB1 - b .LSGEMM_L8x8_SUB1 + b SGEMM_L8x8_SUB1 -.LSGEMM_L8x8_SUB0: +SGEMM_L8x8_SUB0: andi. L, K, 7 KERNEL8x8_SUBI1 addic. 
L, L, -1 - ble .LSGEMM_L8x8_SAVE - b .LSGEMM_L8x8_SUB2 + ble SGEMM_L8x8_SAVE + b SGEMM_L8x8_SUB2 -.LSGEMM_L8x8_SUB1: +SGEMM_L8x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x8_SAVE + ble SGEMM_L8x8_SAVE -.LSGEMM_L8x8_SUB2: +SGEMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x8_SUB2 + bgt SGEMM_L8x8_SUB2 -.LSGEMM_L8x8_SAVE: +SGEMM_L8x8_SAVE: SAVE8x8 -.LSGEMM_L8x8_END: +SGEMM_L8x8_END: -.LSGEMM_L8x4_BEGIN: +SGEMM_L8x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L8x4_END - mr BO, B + ble SGEMM_L8x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x4_SUB0 + ble SGEMM_L8x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x4_SUB4 + ble SGEMM_L8x4_SUB4 -.LSGEMM_L8x4_LOOP_START: +SGEMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 @@ -287,11 +342,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -2 - ble .LSGEMM_L8x4_LOOP_END + ble SGEMM_L8x4_LOOP_END .align 5 -.LSGEMM_L8x4_LOOP: +SGEMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 @@ -304,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -1 - bgt .LSGEMM_L8x4_LOOP + bgt SGEMM_L8x4_LOOP -.LSGEMM_L8x4_LOOP_END: +SGEMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 @@ -318,9 +373,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_1 KERNEL8x4_E2 - b .LSGEMM_L8x4_SUB1 + b SGEMM_L8x4_SUB1 -.LSGEMM_L8x4_SUB4: +SGEMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 @@ -332,48 +387,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_SUB1 KERNEL8x4_SUB1 - b .LSGEMM_L8x4_SUB1 + b SGEMM_L8x4_SUB1 -.LSGEMM_L8x4_SUB0: +SGEMM_L8x4_SUB0: andi. L, K, 7 KERNEL8x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x4_SAVE - b .LSGEMM_L8x4_SUB2 + ble SGEMM_L8x4_SAVE + b SGEMM_L8x4_SUB2 -.LSGEMM_L8x4_SUB1: +SGEMM_L8x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x4_SAVE + ble SGEMM_L8x4_SAVE -.LSGEMM_L8x4_SUB2: +SGEMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x4_SUB2 + bgt SGEMM_L8x4_SUB2 -.LSGEMM_L8x4_SAVE: +SGEMM_L8x4_SAVE: SAVE8x4 -.LSGEMM_L8x4_END: +SGEMM_L8x4_END: -.LSGEMM_L8x2_BEGIN: +SGEMM_L8x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L8x2_END - mr BO, B + ble SGEMM_L8x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x2_SUB0 + ble SGEMM_L8x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x2_SUB4 + ble SGEMM_L8x2_SUB4 -.LSGEMM_L8x2_LOOP_START: +SGEMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 @@ -387,11 +442,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -2 - ble .LSGEMM_L8x2_LOOP_END + ble SGEMM_L8x2_LOOP_END .align 5 -.LSGEMM_L8x2_LOOP: +SGEMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 @@ -404,9 +459,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -1 - bgt .LSGEMM_L8x2_LOOP + bgt SGEMM_L8x2_LOOP -.LSGEMM_L8x2_LOOP_END: +SGEMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 @@ -418,9 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_1 KERNEL8x2_E2 - b .LSGEMM_L8x2_SUB1 + b SGEMM_L8x2_SUB1 -.LSGEMM_L8x2_SUB4: +SGEMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 @@ -432,48 +487,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_SUB1 KERNEL8x2_SUB1 - b .LSGEMM_L8x2_SUB1 + b SGEMM_L8x2_SUB1 -.LSGEMM_L8x2_SUB0: +SGEMM_L8x2_SUB0: andi. L, K, 7 KERNEL8x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x2_SAVE - b .LSGEMM_L8x2_SUB2 + ble SGEMM_L8x2_SAVE + b SGEMM_L8x2_SUB2 -.LSGEMM_L8x2_SUB1: +SGEMM_L8x2_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L8x2_SAVE + ble SGEMM_L8x2_SAVE -.LSGEMM_L8x2_SUB2: +SGEMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x2_SUB2 + bgt SGEMM_L8x2_SUB2 -.LSGEMM_L8x2_SAVE: +SGEMM_L8x2_SAVE: SAVE8x2 -.LSGEMM_L8x2_END: +SGEMM_L8x2_END: -.LSGEMM_L8x1_BEGIN: +SGEMM_L8x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L8x1_END - mr BO, B + ble SGEMM_L8x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x1_SUB0 + ble SGEMM_L8x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x1_SUB4 + ble SGEMM_L8x1_SUB4 -.LSGEMM_L8x1_LOOP_START: +SGEMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 @@ -487,11 +542,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -2 - ble .LSGEMM_L8x1_LOOP_END + ble SGEMM_L8x1_LOOP_END .align 5 -.LSGEMM_L8x1_LOOP: +SGEMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 @@ -504,9 +559,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -1 - bgt .LSGEMM_L8x1_LOOP + bgt SGEMM_L8x1_LOOP -.LSGEMM_L8x1_LOOP_END: +SGEMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 @@ -518,9 +573,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_1 KERNEL8x1_E2 - b .LSGEMM_L8x1_SUB1 + b SGEMM_L8x1_SUB1 -.LSGEMM_L8x1_SUB4: +SGEMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 @@ -532,74 +587,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_SUB1 KERNEL8x1_SUB1 - b .LSGEMM_L8x1_SUB1 + b SGEMM_L8x1_SUB1 -.LSGEMM_L8x1_SUB0: +SGEMM_L8x1_SUB0: andi. L, K, 7 KERNEL8x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x1_SAVE - b .LSGEMM_L8x1_SUB2 + ble SGEMM_L8x1_SAVE + b SGEMM_L8x1_SUB2 -.LSGEMM_L8x1_SUB1: +SGEMM_L8x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x1_SAVE + ble SGEMM_L8x1_SAVE -.LSGEMM_L8x1_SUB2: +SGEMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x1_SUB2 + bgt SGEMM_L8x1_SUB2 -.LSGEMM_L8x1_SAVE: +SGEMM_L8x1_SAVE: SAVE8x1 -.LSGEMM_L8x1_END: +SGEMM_L8x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LSGEMM_L8_BEGIN + bgt SGEMM_L8_BEGIN andi. T2, N, 7 - ble .L999 + ble L999 -.LSGEMM_L8_END: +SGEMM_L8_END: - b .LSGEMM_L4_BEGIN + b SGEMM_L4_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 -.LSGEMM_L4_BEGIN: +SGEMM_L4_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +SGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L4_COPYB andi. T1, N, 4 - ble .LSGEMM_L4_END + ble SGEMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble .LSGEMM_L4x16_END + ble SGEMM_L4x16_END -.LSGEMM_L4x16_BEGIN: +SGEMM_L4x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x16_SUB0 + ble SGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x16_SUB4 + ble SGEMM_L4x16_SUB4 -.LSGEMM_L4x16_LOOP_START: +SGEMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -618,11 +705,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. 
L, L, -2 - ble .LSGEMM_L4x16_LOOP_END + ble SGEMM_L4x16_LOOP_END .align 5 -.LSGEMM_L4x16_LOOP: +SGEMM_L4x16_LOOP: KERNEL4x16_1 dcbt AO, PRE @@ -639,9 +726,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -1 - bgt .LSGEMM_L4x16_LOOP + bgt SGEMM_L4x16_LOOP -.LSGEMM_L4x16_LOOP_END: +SGEMM_L4x16_LOOP_END: KERNEL4x16_1 dcbt AO, PRE @@ -656,9 +743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_1 KERNEL4x16_E2 - b .LSGEMM_L4x16_SUB1 + b SGEMM_L4x16_SUB1 -.LSGEMM_L4x16_SUB4: +SGEMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -672,53 +759,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b .LSGEMM_L4x16_SUB1 + b SGEMM_L4x16_SUB1 -.LSGEMM_L4x16_SUB0: +SGEMM_L4x16_SUB0: andi. L, K, 7 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x16_SAVE - b .LSGEMM_L4x16_SUB2 + ble SGEMM_L4x16_SAVE + b SGEMM_L4x16_SUB2 -.LSGEMM_L4x16_SUB1: +SGEMM_L4x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x16_SAVE + ble SGEMM_L4x16_SAVE -.LSGEMM_L4x16_SUB2: +SGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x16_SUB2 + bgt SGEMM_L4x16_SUB2 -.LSGEMM_L4x16_SAVE: +SGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LSGEMM_L4x16_BEGIN + bgt SGEMM_L4x16_BEGIN -.LSGEMM_L4x16_END: +SGEMM_L4x16_END: -.LSGEMM_L4x8_BEGIN: +SGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L4x1_END + ble SGEMM_L4x1_END andi. T1, M, 8 - ble .LSGEMM_L4x8_END - mr BO, B + ble SGEMM_L4x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x8_SUB0 + ble SGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x8_SUB4 + ble SGEMM_L4x8_SUB4 -.LSGEMM_L4x8_LOOP_START: +SGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -732,11 +819,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LSGEMM_L4x8_LOOP_END + ble SGEMM_L4x8_LOOP_END .align 5 -.LSGEMM_L4x8_LOOP: +SGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -749,9 +836,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LSGEMM_L4x8_LOOP + bgt SGEMM_L4x8_LOOP -.LSGEMM_L4x8_LOOP_END: +SGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -763,9 +850,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LSGEMM_L4x8_SUB1 + b SGEMM_L4x8_SUB1 -.LSGEMM_L4x8_SUB4: +SGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -777,48 +864,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LSGEMM_L4x8_SUB1 + b SGEMM_L4x8_SUB1 -.LSGEMM_L4x8_SUB0: +SGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x8_SAVE - b .LSGEMM_L4x8_SUB2 + ble SGEMM_L4x8_SAVE + b SGEMM_L4x8_SUB2 -.LSGEMM_L4x8_SUB1: +SGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x8_SAVE + ble SGEMM_L4x8_SAVE -.LSGEMM_L4x8_SUB2: +SGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x8_SUB2 + bgt SGEMM_L4x8_SUB2 -.LSGEMM_L4x8_SAVE: +SGEMM_L4x8_SAVE: SAVE4x8 -.LSGEMM_L4x8_END: +SGEMM_L4x8_END: -.LSGEMM_L4x4_BEGIN: +SGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L4x4_END - mr BO, B + ble SGEMM_L4x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x4_SUB0 + ble SGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x4_SUB4 + ble SGEMM_L4x4_SUB4 -.LSGEMM_L4x4_LOOP_START: +SGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -832,11 +919,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. 
L, L, -2 - ble .LSGEMM_L4x4_LOOP_END + ble SGEMM_L4x4_LOOP_END .align 5 -.LSGEMM_L4x4_LOOP: +SGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -849,9 +936,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LSGEMM_L4x4_LOOP + bgt SGEMM_L4x4_LOOP -.LSGEMM_L4x4_LOOP_END: +SGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -863,9 +950,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LSGEMM_L4x4_SUB1 + b SGEMM_L4x4_SUB1 -.LSGEMM_L4x4_SUB4: +SGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -877,48 +964,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LSGEMM_L4x4_SUB1 + b SGEMM_L4x4_SUB1 -.LSGEMM_L4x4_SUB0: +SGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x4_SAVE - b .LSGEMM_L4x4_SUB2 + ble SGEMM_L4x4_SAVE + b SGEMM_L4x4_SUB2 -.LSGEMM_L4x4_SUB1: +SGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x4_SAVE + ble SGEMM_L4x4_SAVE -.LSGEMM_L4x4_SUB2: +SGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x4_SUB2 + bgt SGEMM_L4x4_SUB2 -.LSGEMM_L4x4_SAVE: +SGEMM_L4x4_SAVE: SAVE4x4 -.LSGEMM_L4x4_END: +SGEMM_L4x4_END: -.LSGEMM_L4x2_BEGIN: +SGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L4x2_END - mr BO, B + ble SGEMM_L4x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x2_SUB0 + ble SGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x2_SUB4 + ble SGEMM_L4x2_SUB4 -.LSGEMM_L4x2_LOOP_START: +SGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -932,11 +1019,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LSGEMM_L4x2_LOOP_END + ble SGEMM_L4x2_LOOP_END .align 5 -.LSGEMM_L4x2_LOOP: +SGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -949,9 +1036,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LSGEMM_L4x2_LOOP + bgt SGEMM_L4x2_LOOP -.LSGEMM_L4x2_LOOP_END: +SGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -963,9 +1050,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LSGEMM_L4x2_SUB1 + b SGEMM_L4x2_SUB1 -.LSGEMM_L4x2_SUB4: +SGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -977,48 +1064,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LSGEMM_L4x2_SUB1 + b SGEMM_L4x2_SUB1 -.LSGEMM_L4x2_SUB0: +SGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x2_SAVE - b .LSGEMM_L4x2_SUB2 + ble SGEMM_L4x2_SAVE + b SGEMM_L4x2_SUB2 -.LSGEMM_L4x2_SUB1: +SGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x2_SAVE + ble SGEMM_L4x2_SAVE -.LSGEMM_L4x2_SUB2: +SGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x2_SUB2 + bgt SGEMM_L4x2_SUB2 -.LSGEMM_L4x2_SAVE: +SGEMM_L4x2_SAVE: SAVE4x2 -.LSGEMM_L4x2_END: +SGEMM_L4x2_END: -.LSGEMM_L4x1_BEGIN: +SGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L4x1_END - mr BO, B + ble SGEMM_L4x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x1_SUB0 + ble SGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x1_SUB4 + ble SGEMM_L4x1_SUB4 -.LSGEMM_L4x1_LOOP_START: +SGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -1032,11 +1119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. 
L, L, -2 - ble .LSGEMM_L4x1_LOOP_END + ble SGEMM_L4x1_LOOP_END .align 5 -.LSGEMM_L4x1_LOOP: +SGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -1049,9 +1136,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LSGEMM_L4x1_LOOP + bgt SGEMM_L4x1_LOOP -.LSGEMM_L4x1_LOOP_END: +SGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -1063,9 +1150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LSGEMM_L4x1_SUB1 + b SGEMM_L4x1_SUB1 -.LSGEMM_L4x1_SUB4: +SGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -1077,61 +1164,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LSGEMM_L4x1_SUB1 + b SGEMM_L4x1_SUB1 -.LSGEMM_L4x1_SUB0: +SGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x1_SAVE - b .LSGEMM_L4x1_SUB2 + ble SGEMM_L4x1_SAVE + b SGEMM_L4x1_SUB2 -.LSGEMM_L4x1_SUB1: +SGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x1_SAVE + ble SGEMM_L4x1_SAVE -.LSGEMM_L4x1_SUB2: +SGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x1_SUB2 + bgt SGEMM_L4x1_SUB2 -.LSGEMM_L4x1_SAVE: +SGEMM_L4x1_SAVE: SAVE4x1 -.LSGEMM_L4x1_END: +SGEMM_L4x1_END: slwi T1, K, 4 add B, B, T1 -.LSGEMM_L4_END: -.LSGEMM_L2_BEGIN: +SGEMM_L4_END: +SGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +SGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L2_COPYB andi. T1, N, 2 - ble .LSGEMM_L2_END + ble SGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LSGEMM_L2x16_END + ble SGEMM_L2x16_END -.LSGEMM_L2x16_BEGIN: +SGEMM_L2x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x16_SUB0 + ble SGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x16_SUB4 + ble SGEMM_L2x16_SUB4 -.LSGEMM_L2x16_LOOP_START: +SGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -1150,11 +1269,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LSGEMM_L2x16_LOOP_END + ble SGEMM_L2x16_LOOP_END .align 5 -.LSGEMM_L2x16_LOOP: +SGEMM_L2x16_LOOP: KERNEL2x16_1 dcbt AO, PRE @@ -1171,9 +1290,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LSGEMM_L2x16_LOOP + bgt SGEMM_L2x16_LOOP -.LSGEMM_L2x16_LOOP_END: +SGEMM_L2x16_LOOP_END: KERNEL2x16_1 dcbt AO, PRE @@ -1188,9 +1307,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LSGEMM_L2x16_SUB1 + b SGEMM_L2x16_SUB1 -.LSGEMM_L2x16_SUB4: +SGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -1204,53 +1323,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LSGEMM_L2x16_SUB1 + b SGEMM_L2x16_SUB1 -.LSGEMM_L2x16_SUB0: +SGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x16_SAVE - b .LSGEMM_L2x16_SUB2 + ble SGEMM_L2x16_SAVE + b SGEMM_L2x16_SUB2 -.LSGEMM_L2x16_SUB1: +SGEMM_L2x16_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L2x16_SAVE + ble SGEMM_L2x16_SAVE -.LSGEMM_L2x16_SUB2: +SGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x16_SUB2 + bgt SGEMM_L2x16_SUB2 -.LSGEMM_L2x16_SAVE: +SGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LSGEMM_L2x16_BEGIN + bgt SGEMM_L2x16_BEGIN -.LSGEMM_L2x16_END: +SGEMM_L2x16_END: -.LSGEMM_L2x8_BEGIN: +SGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L2x1_END + ble SGEMM_L2x1_END andi. T1, M, 8 - ble .LSGEMM_L2x8_END - mr BO, B + ble SGEMM_L2x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x8_SUB0 + ble SGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x8_SUB4 + ble SGEMM_L2x8_SUB4 -.LSGEMM_L2x8_LOOP_START: +SGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -1264,11 +1383,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LSGEMM_L2x8_LOOP_END + ble SGEMM_L2x8_LOOP_END .align 5 -.LSGEMM_L2x8_LOOP: +SGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -1281,9 +1400,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LSGEMM_L2x8_LOOP + bgt SGEMM_L2x8_LOOP -.LSGEMM_L2x8_LOOP_END: +SGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1295,9 +1414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LSGEMM_L2x8_SUB1 + b SGEMM_L2x8_SUB1 -.LSGEMM_L2x8_SUB4: +SGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1309,48 +1428,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LSGEMM_L2x8_SUB1 + b SGEMM_L2x8_SUB1 -.LSGEMM_L2x8_SUB0: +SGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x8_SAVE - b .LSGEMM_L2x8_SUB2 + ble SGEMM_L2x8_SAVE + b SGEMM_L2x8_SUB2 -.LSGEMM_L2x8_SUB1: +SGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x8_SAVE + ble SGEMM_L2x8_SAVE -.LSGEMM_L2x8_SUB2: +SGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x8_SUB2 + bgt SGEMM_L2x8_SUB2 -.LSGEMM_L2x8_SAVE: +SGEMM_L2x8_SAVE: SAVE2x8 -.LSGEMM_L2x8_END: +SGEMM_L2x8_END: -.LSGEMM_L2x4_BEGIN: +SGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L2x4_END - mr BO, B + ble SGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x4_SUB0 + ble SGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x4_SUB4 + ble SGEMM_L2x4_SUB4 -.LSGEMM_L2x4_LOOP_START: +SGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1364,11 +1483,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LSGEMM_L2x4_LOOP_END + ble SGEMM_L2x4_LOOP_END .align 5 -.LSGEMM_L2x4_LOOP: +SGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1381,9 +1500,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LSGEMM_L2x4_LOOP + bgt SGEMM_L2x4_LOOP -.LSGEMM_L2x4_LOOP_END: +SGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1395,9 +1514,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LSGEMM_L2x4_SUB1 + b SGEMM_L2x4_SUB1 -.LSGEMM_L2x4_SUB4: +SGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1409,48 +1528,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LSGEMM_L2x4_SUB1 + b SGEMM_L2x4_SUB1 -.LSGEMM_L2x4_SUB0: +SGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x4_SAVE - b .LSGEMM_L2x4_SUB2 + ble SGEMM_L2x4_SAVE + b SGEMM_L2x4_SUB2 -.LSGEMM_L2x4_SUB1: +SGEMM_L2x4_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L2x4_SAVE + ble SGEMM_L2x4_SAVE -.LSGEMM_L2x4_SUB2: +SGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x4_SUB2 + bgt SGEMM_L2x4_SUB2 -.LSGEMM_L2x4_SAVE: +SGEMM_L2x4_SAVE: SAVE2x4 -.LSGEMM_L2x4_END: +SGEMM_L2x4_END: -.LSGEMM_L2x2_BEGIN: +SGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L2x2_END - mr BO, B + ble SGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x2_SUB0 + ble SGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x2_SUB4 + ble SGEMM_L2x2_SUB4 -.LSGEMM_L2x2_LOOP_START: +SGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -1464,11 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LSGEMM_L2x2_LOOP_END + ble SGEMM_L2x2_LOOP_END .align 5 -.LSGEMM_L2x2_LOOP: +SGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -1481,9 +1600,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LSGEMM_L2x2_LOOP + bgt SGEMM_L2x2_LOOP -.LSGEMM_L2x2_LOOP_END: +SGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -1495,9 +1614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LSGEMM_L2x2_SUB1 + b SGEMM_L2x2_SUB1 -.LSGEMM_L2x2_SUB4: +SGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1509,48 +1628,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LSGEMM_L2x2_SUB1 + b SGEMM_L2x2_SUB1 -.LSGEMM_L2x2_SUB0: +SGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x2_SAVE - b .LSGEMM_L2x2_SUB2 + ble SGEMM_L2x2_SAVE + b SGEMM_L2x2_SUB2 -.LSGEMM_L2x2_SUB1: +SGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x2_SAVE + ble SGEMM_L2x2_SAVE -.LSGEMM_L2x2_SUB2: +SGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x2_SUB2 + bgt SGEMM_L2x2_SUB2 -.LSGEMM_L2x2_SAVE: +SGEMM_L2x2_SAVE: SAVE2x2 -.LSGEMM_L2x2_END: +SGEMM_L2x2_END: -.LSGEMM_L2x1_BEGIN: +SGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L2x1_END - mr BO, B + ble SGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x1_SUB0 + ble SGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x1_SUB4 + ble SGEMM_L2x1_SUB4 -.LSGEMM_L2x1_LOOP_START: +SGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1564,11 +1683,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LSGEMM_L2x1_LOOP_END + ble SGEMM_L2x1_LOOP_END .align 5 -.LSGEMM_L2x1_LOOP: +SGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1581,9 +1700,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LSGEMM_L2x1_LOOP + bgt SGEMM_L2x1_LOOP -.LSGEMM_L2x1_LOOP_END: +SGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1595,9 +1714,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LSGEMM_L2x1_SUB1 + b SGEMM_L2x1_SUB1 -.LSGEMM_L2x1_SUB4: +SGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1609,59 +1728,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LSGEMM_L2x1_SUB1 + b SGEMM_L2x1_SUB1 -.LSGEMM_L2x1_SUB0: +SGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x1_SAVE - b .LSGEMM_L2x1_SUB2 + ble SGEMM_L2x1_SAVE + b SGEMM_L2x1_SUB2 -.LSGEMM_L2x1_SUB1: +SGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x1_SAVE + ble SGEMM_L2x1_SAVE -.LSGEMM_L2x1_SUB2: +SGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 - bgt .LSGEMM_L2x1_SUB2 + bgt SGEMM_L2x1_SUB2 -.LSGEMM_L2x1_SAVE: +SGEMM_L2x1_SAVE: SAVE2x1 -.LSGEMM_L2x1_END: +SGEMM_L2x1_END: slwi T1, K, 3 add B, B, T1 -.LSGEMM_L2_END: -.LSGEMM_L1_BEGIN: +SGEMM_L2_END: +SGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +SGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L1_COPYB andi. T1, N, 1 - ble .LSGEMM_L1_END + ble SGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LSGEMM_L1x16_END + ble SGEMM_L1x16_END -.LSGEMM_L1x16_BEGIN: +SGEMM_L1x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x16_SUB0 + ble SGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x16_SUB4 + ble SGEMM_L1x16_SUB4 -.LSGEMM_L1x16_LOOP_START: +SGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1680,11 +1831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LSGEMM_L1x16_LOOP_END + ble SGEMM_L1x16_LOOP_END .align 5 -.LSGEMM_L1x16_LOOP: +SGEMM_L1x16_LOOP: KERNEL1x16_1 dcbt AO, PRE @@ -1701,9 +1852,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LSGEMM_L1x16_LOOP + bgt SGEMM_L1x16_LOOP -.LSGEMM_L1x16_LOOP_END: +SGEMM_L1x16_LOOP_END: KERNEL1x16_1 dcbt AO, PRE @@ -1718,9 +1869,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LSGEMM_L1x16_SUB1 + b SGEMM_L1x16_SUB1 -.LSGEMM_L1x16_SUB4: +SGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1734,53 +1885,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LSGEMM_L1x16_SUB1 + b SGEMM_L1x16_SUB1 -.LSGEMM_L1x16_SUB0: +SGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x16_SAVE - b .LSGEMM_L1x16_SUB2 + ble SGEMM_L1x16_SAVE + b SGEMM_L1x16_SUB2 -.LSGEMM_L1x16_SUB1: +SGEMM_L1x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x16_SAVE + ble SGEMM_L1x16_SAVE -.LSGEMM_L1x16_SUB2: +SGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x16_SUB2 + bgt SGEMM_L1x16_SUB2 -.LSGEMM_L1x16_SAVE: +SGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LSGEMM_L1x16_BEGIN + bgt SGEMM_L1x16_BEGIN -.LSGEMM_L1x16_END: +SGEMM_L1x16_END: -.LSGEMM_L1x8_BEGIN: +SGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L1x1_END + ble SGEMM_L1x1_END andi. T1, M, 8 - ble .LSGEMM_L1x8_END - mr BO, B + ble SGEMM_L1x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x8_SUB0 + ble SGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x8_SUB4 + ble SGEMM_L1x8_SUB4 -.LSGEMM_L1x8_LOOP_START: +SGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1794,11 +1945,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LSGEMM_L1x8_LOOP_END + ble SGEMM_L1x8_LOOP_END .align 5 -.LSGEMM_L1x8_LOOP: +SGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1811,9 +1962,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. 
L, L, -1 - bgt .LSGEMM_L1x8_LOOP + bgt SGEMM_L1x8_LOOP -.LSGEMM_L1x8_LOOP_END: +SGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1825,9 +1976,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LSGEMM_L1x8_SUB1 + b SGEMM_L1x8_SUB1 -.LSGEMM_L1x8_SUB4: +SGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1839,48 +1990,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LSGEMM_L1x8_SUB1 + b SGEMM_L1x8_SUB1 -.LSGEMM_L1x8_SUB0: +SGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x8_SAVE - b .LSGEMM_L1x8_SUB2 + ble SGEMM_L1x8_SAVE + b SGEMM_L1x8_SUB2 -.LSGEMM_L1x8_SUB1: +SGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x8_SAVE + ble SGEMM_L1x8_SAVE -.LSGEMM_L1x8_SUB2: +SGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x8_SUB2 + bgt SGEMM_L1x8_SUB2 -.LSGEMM_L1x8_SAVE: +SGEMM_L1x8_SAVE: SAVE1x8 -.LSGEMM_L1x8_END: +SGEMM_L1x8_END: -.LSGEMM_L1x4_BEGIN: +SGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L1x4_END - mr BO, B + ble SGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x4_SUB0 + ble SGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x4_SUB4 + ble SGEMM_L1x4_SUB4 -.LSGEMM_L1x4_LOOP_START: +SGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1894,11 +2045,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LSGEMM_L1x4_LOOP_END + ble SGEMM_L1x4_LOOP_END .align 5 -.LSGEMM_L1x4_LOOP: +SGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1911,9 +2062,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LSGEMM_L1x4_LOOP + bgt SGEMM_L1x4_LOOP -.LSGEMM_L1x4_LOOP_END: +SGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1925,9 +2076,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LSGEMM_L1x4_SUB1 + b SGEMM_L1x4_SUB1 -.LSGEMM_L1x4_SUB4: +SGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1939,48 +2090,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LSGEMM_L1x4_SUB1 + b SGEMM_L1x4_SUB1 -.LSGEMM_L1x4_SUB0: +SGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x4_SAVE - b .LSGEMM_L1x4_SUB2 + ble SGEMM_L1x4_SAVE + b SGEMM_L1x4_SUB2 -.LSGEMM_L1x4_SUB1: +SGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x4_SAVE + ble SGEMM_L1x4_SAVE -.LSGEMM_L1x4_SUB2: +SGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x4_SUB2 + bgt SGEMM_L1x4_SUB2 -.LSGEMM_L1x4_SAVE: +SGEMM_L1x4_SAVE: SAVE1x4 -.LSGEMM_L1x4_END: +SGEMM_L1x4_END: -.LSGEMM_L1x2_BEGIN: +SGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L1x2_END - mr BO, B + ble SGEMM_L1x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x2_SUB0 + ble SGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x2_SUB4 + ble SGEMM_L1x2_SUB4 -.LSGEMM_L1x2_LOOP_START: +SGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1994,11 +2145,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LSGEMM_L1x2_LOOP_END + ble SGEMM_L1x2_LOOP_END .align 5 -.LSGEMM_L1x2_LOOP: +SGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -2011,9 +2162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. 
L, L, -1 - bgt .LSGEMM_L1x2_LOOP + bgt SGEMM_L1x2_LOOP -.LSGEMM_L1x2_LOOP_END: +SGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2025,9 +2176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LSGEMM_L1x2_SUB1 + b SGEMM_L1x2_SUB1 -.LSGEMM_L1x2_SUB4: +SGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2039,48 +2190,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LSGEMM_L1x2_SUB1 + b SGEMM_L1x2_SUB1 -.LSGEMM_L1x2_SUB0: +SGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x2_SAVE - b .LSGEMM_L1x2_SUB2 + ble SGEMM_L1x2_SAVE + b SGEMM_L1x2_SUB2 -.LSGEMM_L1x2_SUB1: +SGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x2_SAVE + ble SGEMM_L1x2_SAVE -.LSGEMM_L1x2_SUB2: +SGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x2_SUB2 + bgt SGEMM_L1x2_SUB2 -.LSGEMM_L1x2_SAVE: +SGEMM_L1x2_SAVE: SAVE1x2 -.LSGEMM_L1x2_END: +SGEMM_L1x2_END: -.LSGEMM_L1x1_BEGIN: +SGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L1x1_END - mr BO, B + ble SGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x1_SUB0 + ble SGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x1_SUB4 + ble SGEMM_L1x1_SUB4 -.LSGEMM_L1x1_LOOP_START: +SGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2094,11 +2245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LSGEMM_L1x1_LOOP_END + ble SGEMM_L1x1_LOOP_END .align 5 -.LSGEMM_L1x1_LOOP: +SGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2111,9 +2262,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LSGEMM_L1x1_LOOP + bgt SGEMM_L1x1_LOOP -.LSGEMM_L1x1_LOOP_END: +SGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2125,9 +2276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LSGEMM_L1x1_SUB1 + b SGEMM_L1x1_SUB1 -.LSGEMM_L1x1_SUB4: +SGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2139,34 +2290,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LSGEMM_L1x1_SUB1 + b SGEMM_L1x1_SUB1 -.LSGEMM_L1x1_SUB0: +SGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x1_SAVE - b .LSGEMM_L1x1_SUB2 + ble SGEMM_L1x1_SAVE + b SGEMM_L1x1_SUB2 -.LSGEMM_L1x1_SUB1: +SGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x1_SAVE + ble SGEMM_L1x1_SAVE -.LSGEMM_L1x1_SUB2: +SGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x1_SUB2 + bgt SGEMM_L1x1_SUB2 -.LSGEMM_L1x1_SAVE: +SGEMM_L1x1_SAVE: SAVE1x1 -.LSGEMM_L1x1_END: +SGEMM_L1x1_END: -.LSGEMM_L1_END: +SGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index a2d36c089..71dc52979 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -26,13 +26,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ + /********************************************************************************************** * Macros for N=8 and M=16 **********************************************************************************************/ @@ -46,21 +47,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -74,21 +75,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -136,42 +137,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_1 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 - lxvw4x vs28, o0, BO - lxvw4x vs4, o0, AO xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 - lxvw4x vs29, o16, BO - lxvw4x vs5, o16, AO xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 - lxvw4x vs6, o32, AO - lxvw4x vs7, o48, AO xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 - xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 @@ -184,8 +194,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 - addi AO, AO, 64 - addi BO, BO, 32 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 @@ -199,47 +207,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL8x16_2 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 - - lxvw4x vs28, o0, BO - lxvw4x vs0, o0, AO - xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 - - lxvw4x vs29, o16, BO - lxvw4x vs1, o16, AO - xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 - lxvw4x vs2, o32, AO - lxvw4x vs3, o48, AO - xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 - xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 - xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 @@ -257,8 +269,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 - addi AO, AO, 64 - addi BO, BO, 32 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 @@ -321,21 +331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -391,21 +401,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -464,106 +474,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -581,106 +503,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -698,106 +532,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr #endif - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -815,106 +561,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr #endif - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -932,106 +590,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs48, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr #endif - stxvw4x vs49, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs50, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs51, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1049,106 +619,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs52, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr #endif - stxvw4x vs53, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs54, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs55, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1166,106 +648,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs56, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr #endif - stxvw4x vs57, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs58, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs59, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1283,106 +677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
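/* In the SAVE macros, alpha is now applied with vector VSX operations
 * instead of spilling each accumulator to TBUFFER and rescaling it one
 * float at a time: xvmulsp combines an accumulator with the splatted
 * alpha_vr for the TRMM path (C = alpha * acc), and xvmaddasp folds the
 * scaling into the update for the plain GEMM path (C += alpha * acc).
 * A minimal C sketch of the per-lane semantics (illustrative only; the
 * function name is made up, not part of the kernel):
 *
 *   void save_lane4(float *c, const float *acc, float alpha, int trmm)
 *   {
 *       for (int i = 0; i < 4; i++)                  // one VSX word lane each
 *           c[i] = trmm ? alpha * acc[i]             // xvmulsp
 *                       : c[i] + alpha * acc[i];     // xvmaddasp
 *   }
 */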
#endif - - stxvw4x vs60, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr #endif - stxvw4x vs61, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs62, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs63, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1406,21 +712,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -1432,21 +738,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -1484,21 +790,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -1536,21 +842,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 @@ -1618,21 +924,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -1670,21 +976,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -1725,58 +1031,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
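/* The vector kernels no longer read B directly and broadcast it with
 * xxspltw; the B panel is assumed to have been pre-expanded by the copy
 * loop added at the start of this patch, so every B value already sits in
 * BBUFFER as a full 16-byte splat.  Each k step of the N=8 kernels above
 * therefore loads eight ready-made vectors with lxvw4x (offsets o0..o48,
 * then another 64 bytes) and advances BO by 128 bytes instead of 32.
 * A sketch of the assumed layout for one k step (illustrative only; the
 * function name is made up):
 *
 *   void splat_b_row8(const float *b, float *bb)     // b: 8 values of B
 *   {
 *       for (int j = 0; j < 8; j++)                  // one vector per value
 *           for (int i = 0; i < 4; i++)
 *               bb[4 * j + i] = b[j];                // xxspltw equivalent
 *   }
 */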
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1790,58 +1052,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1855,58 +1073,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1920,58 +1094,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr #endif - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1985,58 +1115,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr #endif - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2050,58 +1136,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr #endif - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2115,58 +1157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr #endif - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2180,58 +1178,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr #endif - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2252,21 +1206,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -2277,21 +1231,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -2320,21 +1274,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -2363,21 +1317,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 @@ -2428,21 +1382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -2471,21 +1425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -2517,34 +1471,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2556,34 +1488,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2595,34 +1505,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2634,34 +1522,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2673,34 +1539,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2712,34 +1556,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs37, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2751,34 +1573,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs38, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2790,34 +1590,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2841,18 +1619,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 .endm @@ -2867,43 +1646,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 - - addi BO, BO, 32 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 128 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs40, vs0, vs12 - xsmulsp vs41, vs1, vs12 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs42, vs0, vs13 - xsmulsp vs43, vs1, vs13 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 - xsmulsp vs44, vs0, vs14 - xsmulsp vs45, vs1, vs14 + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 - xsmulsp vs46, vs0, vs15 - xsmulsp vs47, vs1, vs15 + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 .endm @@ -2919,43 +1699,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 - - addi BO, BO, 32 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 128 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs40, vs0, vs12 - xsmaddasp vs41, vs1, vs12 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs42, vs0, vs13 - xsmaddasp vs43, vs1, vs13 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 - xsmaddasp vs44, vs0, vs14 - xsmaddasp vs45, vs1, vs14 + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 - xsmaddasp vs46, vs0, vs15 - xsmaddasp vs47, vs1, vs15 + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 .endm @@ -2971,43 +1752,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + addi BO, BO, 128 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs40, vs4, vs20 - xsmaddasp vs41, vs5, vs20 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs42, vs4, vs21 - xsmaddasp vs43, vs5, vs21 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 - xsmaddasp vs44, vs4, vs22 - xsmaddasp vs45, vs5, vs22 + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 - xsmaddasp vs46, vs4, vs23 - xsmaddasp vs47, vs5, vs23 + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 .endm @@ -3015,29 +1797,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 - xsmaddasp vs40, vs4, vs20 - xsmaddasp vs41, vs5, vs20 + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 - xsmaddasp vs42, vs4, vs21 - xsmaddasp vs43, vs5, vs21 + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 - xsmaddasp vs44, vs4, vs22 - xsmaddasp vs45, vs5, vs22 + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 - xsmaddasp vs46, vs4, vs23 - xsmaddasp vs47, vs5, vs23 + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 .endm @@ -3053,43 +1835,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
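/* The scalar N=8 micro-kernels (KERNEL8x2_* and KERNEL8x1_*) read from the
 * same pre-splatted B buffer, so the per-value lxsspx offsets grow from
 * 4/8/12 bytes to o16/o32/o48 and BO advances by 128 bytes per k step.
 * Their arithmetic also switches from the single-precision scalar forms
 * (xsmulsp/xsmaddasp) to the double-precision forms (xsmuldp/xsmaddadp):
 * lxsspx already converts each operand to double-precision format in the
 * VSR, and stxsspx rounds the result back to single precision on store,
 * so the accumulation simply runs in double. */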
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 128 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs40, vs0, vs12 - xsmulsp vs41, vs1, vs12 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs42, vs0, vs13 - xsmulsp vs43, vs1, vs13 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 - xsmulsp vs44, vs0, vs14 - xsmulsp vs45, vs1, vs14 + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 - xsmulsp vs46, vs0, vs15 - xsmulsp vs47, vs1, vs15 + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 .endm @@ -3105,43 +1888,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 128 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs40, vs0, vs12 - xsmaddasp vs41, vs1, vs12 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs42, vs0, vs13 - xsmaddasp vs43, vs1, vs13 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 - xsmaddasp vs44, vs0, vs14 - xsmaddasp vs45, vs1, vs14 + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 - xsmaddasp vs46, vs0, vs15 - xsmaddasp vs47, vs1, vs15 + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 .endm @@ -3158,17 +1942,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -3185,17 +1963,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -3212,17 +1984,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - xsmulsp vs1, vs37, alpha_r - + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs37, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -3239,17 +2005,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - xsmulsp vs1, vs39, alpha_r - + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs39, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -3266,17 +2026,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs40, alpha_r - xsmulsp vs1, vs41, alpha_r - + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r #else - - xsmulsp vs28, vs40, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs41, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r #endif stxsspx vs0, o0, T1 @@ -3293,17 +2047,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs42, alpha_r - xsmulsp vs1, vs43, alpha_r - + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r #else - - xsmulsp vs28, vs42, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs43, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r #endif stxsspx vs0, o0, T1 @@ -3320,17 +2068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs44, alpha_r - xsmulsp vs1, vs45, alpha_r - + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r #else - - xsmulsp vs28, vs44, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs45, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r #endif stxsspx vs0, o0, T1 @@ -3347,17 +2089,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs46, alpha_r - xsmulsp vs1, vs47, alpha_r - + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r #else - - xsmulsp vs28, vs46, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs47, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r #endif stxsspx vs0, o0, T1 @@ -3383,18 +2119,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
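/* In the scalar SAVE paths above, the separate multiply-into-vs28 followed
 * by xsaddsp is collapsed into a single fused multiply-add: xsmuldp gives
 * C = alpha * acc for TRMMKERNEL and xsmaddadp gives C += alpha * acc for
 * the ordinary GEMM update, with alpha_r as the scalar alpha. */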
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 .endm @@ -3408,35 +2145,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 - - addi BO, BO, 32 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 128 - xsmulsp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs32, vs0, vs8 - xsmulsp vs35, vs0, vs11 + xsmuldp vs33, vs0, vs9 - xsmulsp vs36, vs0, vs12 + xsmuldp vs34, vs0, vs10 - xsmulsp vs37, vs0, vs13 + xsmuldp vs35, vs0, vs11 - xsmulsp vs38, vs0, vs14 + xsmuldp vs36, vs0, vs12 - xsmulsp vs39, vs0, vs15 + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 .endm @@ -3451,35 +2189,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 - - addi BO, BO, 32 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 128 - xsmaddasp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs36, vs0, vs12 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs37, vs0, vs13 + xsmaddadp vs35, vs0, vs11 - xsmaddasp vs38, vs0, vs14 + xsmaddadp vs36, vs0, vs12 - xsmaddasp vs39, vs0, vs15 + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 .endm @@ -3494,35 +2233,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmaddasp vs32, vs4, vs16 + addi BO, BO, 128 - xsmaddasp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs36, vs4, vs20 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs37, vs4, vs21 + xsmaddadp vs35, vs4, vs19 - xsmaddasp vs38, vs4, vs22 + xsmaddadp vs36, vs4, vs20 - xsmaddasp vs39, vs4, vs23 + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 .endm @@ -3530,21 +2270,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL8x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 - xsmaddasp vs36, vs4, vs20 + xsmaddadp vs36, vs4, vs20 - xsmaddasp vs37, vs4, vs21 + xsmaddadp vs37, vs4, vs21 - xsmaddasp vs38, vs4, vs22 + xsmaddadp vs38, vs4, vs22 - xsmaddasp vs39, vs4, vs23 + xsmaddadp vs39, vs4, vs23 .endm @@ -3559,35 +2299,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 128 - xsmulsp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs32, vs0, vs8 - xsmulsp vs35, vs0, vs11 + xsmuldp vs33, vs0, vs9 - xsmulsp vs36, vs0, vs12 + xsmuldp vs34, vs0, vs10 - xsmulsp vs37, vs0, vs13 + xsmuldp vs35, vs0, vs11 - xsmulsp vs38, vs0, vs14 + xsmuldp vs36, vs0, vs12 - xsmulsp vs39, vs0, vs15 + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 .endm @@ -3602,35 +2343,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 - - addi BO, BO, 32 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 128 - xsmaddasp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs36, vs0, vs12 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs37, vs0, vs13 + xsmaddadp vs35, vs0, vs11 - xsmaddasp vs38, vs0, vs14 + xsmaddadp vs36, vs0, vs12 - xsmaddasp vs39, vs0, vs15 + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 .endm @@ -3646,14 +2388,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -3668,14 +2405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -3690,14 +2422,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - + xsmuldp vs0, vs34, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 @@ -3712,14 +2439,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs35, alpha_r - + xsmuldp vs0, vs35, alpha_r #else - - xsmulsp vs28, vs35, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -3734,14 +2456,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - + xsmuldp vs0, vs36, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs36, alpha_r #endif stxsspx vs0, o0, T1 @@ -3756,14 +2473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs37, alpha_r - + xsmuldp vs0, vs37, alpha_r #else - - xsmulsp vs28, vs37, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -3778,14 +2490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - + xsmuldp vs0, vs38, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs38, alpha_r #endif stxsspx vs0, o0, T1 @@ -3800,14 +2507,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs39, alpha_r - + xsmuldp vs0, vs39, alpha_r #else - - xsmulsp vs28, vs39, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -3832,14 +2534,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -3853,14 +2555,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -3896,14 +2598,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -3939,14 +2641,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -4008,14 +2710,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4051,14 +2753,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
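/* The N=4 kernels (LOAD4x16_1 / KERNEL4x16_* above) need only four B values
 * per k step, so a single block of pre-splatted vectors is loaded at
 * offsets o0..o48 and BO advances by 64 bytes instead of 16. */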
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4097,106 +2799,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4214,106 +2828,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4331,106 +2857,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr #endif - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4448,106 +2886,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr #endif - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4571,14 +2921,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -4590,14 +2940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4623,14 +2973,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4656,14 +3006,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -4707,14 +3057,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4740,14 +3090,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4776,58 +3126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4841,58 +3147,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4906,58 +3168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4971,58 +3189,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr #endif - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -5043,14 +3217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -5061,14 +3235,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -5089,14 +3263,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -5117,14 +3291,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -5159,14 +3333,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -5187,14 +3361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -5218,34 +3392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5257,34 +3409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5296,34 +3426,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5335,34 +3443,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5386,11 +3472,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 .endm @@ -5405,24 +3492,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 - - addi BO, BO, 16 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 64 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 .endm @@ -5438,24 +3526,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 - - addi BO, BO, 16 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 64 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 .endm @@ -5471,24 +3560,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + addi BO, BO, 64 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 .endm @@ -5496,17 +3586,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 .endm @@ -5522,24 +3612,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 64 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 .endm @@ -5555,24 +3646,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 64 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 .endm @@ -5589,17 +3681,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -5616,17 +3702,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -5643,17 +3723,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - xsmulsp vs1, vs37, alpha_r - + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs37, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -5670,17 +3744,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - xsmulsp vs1, vs39, alpha_r - + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs39, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -5706,11 +3774,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 .endm @@ -5724,20 +3793,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 - - addi BO, BO, 16 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 64 - xsmulsp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs32, vs0, vs8 - xsmulsp vs35, vs0, vs11 + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 .endm @@ -5752,20 +3822,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 - - addi BO, BO, 16 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 64 - xsmaddasp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 .endm @@ -5780,20 +3851,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmaddasp vs32, vs4, vs16 + addi BO, BO, 64 - xsmaddasp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 .endm @@ -5801,13 +3873,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 .endm @@ -5822,20 +3894,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 64 - xsmulsp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs32, vs0, vs8 - xsmulsp vs35, vs0, vs11 + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 .endm @@ -5850,20 +3923,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 - - addi BO, BO, 16 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 64 - xsmaddasp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 .endm @@ -5879,14 +3953,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -5901,14 +3970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -5923,14 +3987,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - + xsmuldp vs0, vs34, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 @@ -5945,14 +4004,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs35, alpha_r - + xsmuldp vs0, vs35, alpha_r #else - - xsmulsp vs28, vs35, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -5977,12 +4031,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -5996,12 +4050,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6027,12 +4081,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6058,12 +4112,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6105,12 +4159,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6136,12 +4190,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6170,106 +4224,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -6287,106 +4253,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -6410,12 +4288,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6427,12 +4305,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6452,12 +4330,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6477,12 +4355,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6514,12 +4392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6539,12 +4417,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6567,58 +4445,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -6632,58 +4466,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -6704,12 +4494,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6720,12 +4510,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6742,12 +4532,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6764,12 +4554,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6796,12 +4586,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6818,12 +4608,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6843,34 +4633,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -6882,34 +4650,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -6933,9 +4679,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 .endm @@ -6950,16 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - - addi BO, BO, 8 + lxsspx vs17, o16, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 32 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 .endm @@ -6975,16 +4723,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - - addi BO, BO, 8 + lxsspx vs17, o16, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 32 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 .endm @@ -7000,16 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + addi BO, BO, 32 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 .endm @@ -7017,11 +4767,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 .endm @@ -7037,16 +4787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + addi BO, BO, 32 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 .endm @@ -7062,16 +4813,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + addi BO, BO, 32 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 .endm @@ -7088,17 +4840,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -7115,17 +4861,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -7151,9 +4891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 .endm @@ -7167,14 +4908,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - - addi BO, BO, 8 + lxsspx vs17, o16, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 32 - xsmulsp vs33, vs0, vs9 + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 .endm @@ -7189,14 +4931,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - - addi BO, BO, 8 + lxsspx vs17, o16, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 32 - xsmaddasp vs33, vs0, vs9 + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 .endm @@ -7211,14 +4954,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmaddasp vs32, vs4, vs16 + addi BO, BO, 32 - xsmaddasp vs33, vs4, vs17 + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 .endm @@ -7226,9 +4970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 .endm @@ -7243,14 +4987,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmulsp vs32, vs0, vs8 + addi BO, BO, 32 - xsmulsp vs33, vs0, vs9 + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 .endm @@ -7265,14 +5010,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - - addi BO, BO, 8 + lxsspx vs9, o16, T1 - xsmaddasp vs32, vs0, vs8 + addi BO, BO, 32 - xsmaddasp vs33, vs0, vs9 + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 .endm @@ -7288,14 +5034,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -7310,14 +5051,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -7342,11 +5078,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7360,11 +5096,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7385,11 +5121,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7410,11 +5146,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7446,11 +5182,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7471,11 +5207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7499,106 +5235,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -7622,11 +5270,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7638,11 +5286,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7659,11 +5307,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7680,11 +5328,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7710,11 +5358,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7731,11 +5379,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7755,58 +5403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -7827,11 +5431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7842,11 +5446,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7861,11 +5465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7880,11 +5484,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7907,11 +5511,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7926,11 +5530,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7948,34 +5552,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -8000,7 +5582,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 .endm @@ -8016,11 +5599,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 .endm @@ -8037,11 +5621,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 .endm @@ -8058,11 +5643,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 .endm @@ -8070,8 +5656,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 .endm @@ -8088,11 +5674,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 .endm @@ -8109,11 +5696,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 .endm @@ -8130,17 +5718,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -8167,7 +5749,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 .endm @@ -8182,10 +5765,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 .endm @@ -8201,10 +5785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 .endm @@ -8220,10 +5805,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 .endm @@ -8231,7 +5817,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 .endm @@ -8247,10 +5833,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 .endm @@ -8266,10 +5853,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 .endm @@ -8285,14 +5873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 5e607c58f..f756d5d92 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -128,6 +128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 +#define alpha_vr vs31 #define o0 0 @@ -152,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "sgemm_macros_16x8_power8.S" +#include "strmm_macros_16x8_power8.S" #ifndef NEEDPARAM @@ -264,11 +265,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 li PRE, 256 li o4 , 4 @@ -280,16 +281,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi TBUFFER, SP, 320 addi T1, SP, 300 - stfs f1, 0(T1) - - lxsspx alpha_r, 0, T1 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 #include "strmm_logic_16x8_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S index 8ec11f1ef..fb2d3f94b 100644 --- a/kernel/power/strmm_logic_16x8_power8.S +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 - ble .LSTRMM_L8_END + ble STRMM_L8_END -.LSTRMM_L8_BEGIN: +STRMM_L8_BEGIN: mr CO, C mr AO, A @@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L8x16_END + ble STRMM_L8x16_END -.LSTRMM_L8x16_BEGIN: +STRMM_L8x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x16_SUB0 + ble STRMM_L8x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x16_SUB4 + ble STRMM_L8x16_SUB4 -.LSTRMM_L8x16_LOOP_START: +STRMM_L8x16_LOOP_START: dcbt AO, PRE LOAD8x16_1 @@ -105,11 +104,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_2 addic. L, L, -2 - ble .LSTRMM_L8x16_LOOP_END + ble STRMM_L8x16_LOOP_END .align 5 -.LSTRMM_L8x16_LOOP: +STRMM_L8x16_LOOP: dcbt AO, PRE KERNEL8x16_1 @@ -130,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_2 addic. L, L, -1 - bgt .LSTRMM_L8x16_LOOP + bgt STRMM_L8x16_LOOP -.LSTRMM_L8x16_LOOP_END: +STRMM_L8x16_LOOP_END: dcbt AO, PRE KERNEL8x16_1 @@ -151,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_1 KERNEL8x16_E2 - b .LSTRMM_L8x16_SUB1 + b STRMM_L8x16_SUB1 -.LSTRMM_L8x16_SUB4: +STRMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 @@ -169,31 +168,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_SUB1 KERNEL8x16_SUB1 - b .LSTRMM_L8x16_SUB1 + b STRMM_L8x16_SUB1 -.LSTRMM_L8x16_SUB0: +STRMM_L8x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x16_SAVE - b .LSTRMM_L8x16_SUB2 + ble STRMM_L8x16_SAVE + b STRMM_L8x16_SUB2 -.LSTRMM_L8x16_SUB1: +STRMM_L8x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x16_SAVE + ble STRMM_L8x16_SAVE -.LSTRMM_L8x16_SUB2: +STRMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x16_SUB2 + bgt STRMM_L8x16_SUB2 -.LSTRMM_L8x16_SAVE: +STRMM_L8x16_SAVE: SAVE8x16 @@ -211,16 +210,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L8x16_BEGIN + bgt STRMM_L8x16_BEGIN -.LSTRMM_L8x16_END: +STRMM_L8x16_END: -.LSTRMM_L8x8_BEGIN: +STRMM_L8x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L8x1_END + ble STRMM_L8x1_END andi. T1, M, 8 - ble .LSTRMM_L8x8_END + ble STRMM_L8x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -246,11 +245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x8_SUB0 + ble STRMM_L8x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x8_SUB4 + ble STRMM_L8x8_SUB4 -.LSTRMM_L8x8_LOOP_START: +STRMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 @@ -264,11 +263,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. 
L, L, -2 - ble .LSTRMM_L8x8_LOOP_END + ble STRMM_L8x8_LOOP_END .align 5 -.LSTRMM_L8x8_LOOP: +STRMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 @@ -281,9 +280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -1 - bgt .LSTRMM_L8x8_LOOP + bgt STRMM_L8x8_LOOP -.LSTRMM_L8x8_LOOP_END: +STRMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 @@ -295,9 +294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_1 KERNEL8x8_E2 - b .LSTRMM_L8x8_SUB1 + b STRMM_L8x8_SUB1 -.LSTRMM_L8x8_SUB4: +STRMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 @@ -309,31 +308,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_SUB1 KERNEL8x8_SUB1 - b .LSTRMM_L8x8_SUB1 + b STRMM_L8x8_SUB1 -.LSTRMM_L8x8_SUB0: +STRMM_L8x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x8_SAVE - b .LSTRMM_L8x8_SUB2 + ble STRMM_L8x8_SAVE + b STRMM_L8x8_SUB2 -.LSTRMM_L8x8_SUB1: +STRMM_L8x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x8_SAVE + ble STRMM_L8x8_SAVE -.LSTRMM_L8x8_SUB2: +STRMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x8_SUB2 + bgt STRMM_L8x8_SUB2 -.LSTRMM_L8x8_SAVE: +STRMM_L8x8_SAVE: SAVE8x8 @@ -350,12 +349,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x8_END: +STRMM_L8x8_END: -.LSTRMM_L8x4_BEGIN: +STRMM_L8x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L8x4_END + ble STRMM_L8x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -381,11 +380,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x4_SUB0 + ble STRMM_L8x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x4_SUB4 + ble STRMM_L8x4_SUB4 -.LSTRMM_L8x4_LOOP_START: +STRMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 @@ -399,11 +398,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -2 - ble .LSTRMM_L8x4_LOOP_END + ble STRMM_L8x4_LOOP_END .align 5 -.LSTRMM_L8x4_LOOP: +STRMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 @@ -416,9 +415,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -1 - bgt .LSTRMM_L8x4_LOOP + bgt STRMM_L8x4_LOOP -.LSTRMM_L8x4_LOOP_END: +STRMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 @@ -430,9 +429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_1 KERNEL8x4_E2 - b .LSTRMM_L8x4_SUB1 + b STRMM_L8x4_SUB1 -.LSTRMM_L8x4_SUB4: +STRMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 @@ -444,31 +443,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_SUB1 KERNEL8x4_SUB1 - b .LSTRMM_L8x4_SUB1 + b STRMM_L8x4_SUB1 -.LSTRMM_L8x4_SUB0: +STRMM_L8x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x4_SAVE - b .LSTRMM_L8x4_SUB2 + ble STRMM_L8x4_SAVE + b STRMM_L8x4_SUB2 -.LSTRMM_L8x4_SUB1: +STRMM_L8x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x4_SAVE + ble STRMM_L8x4_SAVE -.LSTRMM_L8x4_SUB2: +STRMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x4_SUB2 + bgt STRMM_L8x4_SUB2 -.LSTRMM_L8x4_SAVE: +STRMM_L8x4_SAVE: SAVE8x4 @@ -485,12 +484,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x4_END: +STRMM_L8x4_END: -.LSTRMM_L8x2_BEGIN: +STRMM_L8x2_BEGIN: andi. 
T1, M, 2 - ble .LSTRMM_L8x2_END + ble STRMM_L8x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -516,11 +515,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x2_SUB0 + ble STRMM_L8x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x2_SUB4 + ble STRMM_L8x2_SUB4 -.LSTRMM_L8x2_LOOP_START: +STRMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 @@ -534,11 +533,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -2 - ble .LSTRMM_L8x2_LOOP_END + ble STRMM_L8x2_LOOP_END .align 5 -.LSTRMM_L8x2_LOOP: +STRMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 @@ -551,9 +550,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -1 - bgt .LSTRMM_L8x2_LOOP + bgt STRMM_L8x2_LOOP -.LSTRMM_L8x2_LOOP_END: +STRMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 @@ -565,9 +564,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_1 KERNEL8x2_E2 - b .LSTRMM_L8x2_SUB1 + b STRMM_L8x2_SUB1 -.LSTRMM_L8x2_SUB4: +STRMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 @@ -579,31 +578,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_SUB1 KERNEL8x2_SUB1 - b .LSTRMM_L8x2_SUB1 + b STRMM_L8x2_SUB1 -.LSTRMM_L8x2_SUB0: +STRMM_L8x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x2_SAVE - b .LSTRMM_L8x2_SUB2 + ble STRMM_L8x2_SAVE + b STRMM_L8x2_SUB2 -.LSTRMM_L8x2_SUB1: +STRMM_L8x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x2_SAVE + ble STRMM_L8x2_SAVE -.LSTRMM_L8x2_SUB2: +STRMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x2_SUB2 + bgt STRMM_L8x2_SUB2 -.LSTRMM_L8x2_SAVE: +STRMM_L8x2_SAVE: SAVE8x2 @@ -620,12 +619,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x2_END: +STRMM_L8x2_END: -.LSTRMM_L8x1_BEGIN: +STRMM_L8x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L8x1_END + ble STRMM_L8x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -651,11 +650,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x1_SUB0 + ble STRMM_L8x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x1_SUB4 + ble STRMM_L8x1_SUB4 -.LSTRMM_L8x1_LOOP_START: +STRMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 @@ -669,11 +668,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -2 - ble .LSTRMM_L8x1_LOOP_END + ble STRMM_L8x1_LOOP_END .align 5 -.LSTRMM_L8x1_LOOP: +STRMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 @@ -686,9 +685,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -1 - bgt .LSTRMM_L8x1_LOOP + bgt STRMM_L8x1_LOOP -.LSTRMM_L8x1_LOOP_END: +STRMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 @@ -700,9 +699,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_1 KERNEL8x1_E2 - b .LSTRMM_L8x1_SUB1 + b STRMM_L8x1_SUB1 -.LSTRMM_L8x1_SUB4: +STRMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 @@ -714,31 +713,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_SUB1 KERNEL8x1_SUB1 - b .LSTRMM_L8x1_SUB1 + b STRMM_L8x1_SUB1 -.LSTRMM_L8x1_SUB0: +STRMM_L8x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x1_SUBI1 addic. 
L, L, -1 - ble .LSTRMM_L8x1_SAVE - b .LSTRMM_L8x1_SUB2 + ble STRMM_L8x1_SAVE + b STRMM_L8x1_SUB2 -.LSTRMM_L8x1_SUB1: +STRMM_L8x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x1_SAVE + ble STRMM_L8x1_SAVE -.LSTRMM_L8x1_SUB2: +STRMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x1_SUB2 + bgt STRMM_L8x1_SUB2 -.LSTRMM_L8x1_SAVE: +STRMM_L8x1_SAVE: SAVE8x1 @@ -755,7 +754,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x1_END: +STRMM_L8x1_END: slwi T1, K, 5 add B, B, T1 @@ -766,23 +765,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. J, J, -1 - bgt .LSTRMM_L8_BEGIN + bgt STRMM_L8_BEGIN andi. T2, N, 7 - ble .L999 + ble L999 -.LSTRMM_L8_END: +STRMM_L8_END: - b .LSTRMM_L4_BEGIN + b STRMM_L4_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 -.LSTRMM_L4_BEGIN: +STRMM_L4_BEGIN: andi. T1, N, 4 - ble .LSTRMM_L4_END + ble STRMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 @@ -793,9 +792,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L4x16_END + ble STRMM_L4x16_END -.LSTRMM_L4x16_BEGIN: +STRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -822,11 +821,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x16_SUB0 + ble STRMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x16_SUB4 + ble STRMM_L4x16_SUB4 -.LSTRMM_L4x16_LOOP_START: +STRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -849,11 +848,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -2 - ble .LSTRMM_L4x16_LOOP_END + ble STRMM_L4x16_LOOP_END .align 5 -.LSTRMM_L4x16_LOOP: +STRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -874,9 +873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -1 - bgt .LSTRMM_L4x16_LOOP + bgt STRMM_L4x16_LOOP -.LSTRMM_L4x16_LOOP_END: +STRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -895,9 +894,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_1 KERNEL4x16_E2 - b .LSTRMM_L4x16_SUB1 + b STRMM_L4x16_SUB1 -.LSTRMM_L4x16_SUB4: +STRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -913,31 +912,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b .LSTRMM_L4x16_SUB1 + b STRMM_L4x16_SUB1 -.LSTRMM_L4x16_SUB0: +STRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x16_SAVE - b .LSTRMM_L4x16_SUB2 + ble STRMM_L4x16_SAVE + b STRMM_L4x16_SUB2 -.LSTRMM_L4x16_SUB1: +STRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x16_SAVE + ble STRMM_L4x16_SAVE -.LSTRMM_L4x16_SUB2: +STRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x16_SUB2 + bgt STRMM_L4x16_SUB2 -.LSTRMM_L4x16_SAVE: +STRMM_L4x16_SAVE: SAVE4x16 @@ -955,16 +954,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L4x16_BEGIN + bgt STRMM_L4x16_BEGIN -.LSTRMM_L4x16_END: +STRMM_L4x16_END: -.LSTRMM_L4x8_BEGIN: +STRMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L4x1_END + ble STRMM_L4x1_END andi. T1, M, 8 - ble .LSTRMM_L4x8_END + ble STRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -990,11 +989,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x8_SUB0 + ble STRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x8_SUB4 + ble STRMM_L4x8_SUB4 -.LSTRMM_L4x8_LOOP_START: +STRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -1008,11 +1007,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LSTRMM_L4x8_LOOP_END + ble STRMM_L4x8_LOOP_END .align 5 -.LSTRMM_L4x8_LOOP: +STRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -1025,9 +1024,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LSTRMM_L4x8_LOOP + bgt STRMM_L4x8_LOOP -.LSTRMM_L4x8_LOOP_END: +STRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -1039,9 +1038,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LSTRMM_L4x8_SUB1 + b STRMM_L4x8_SUB1 -.LSTRMM_L4x8_SUB4: +STRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -1053,31 +1052,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LSTRMM_L4x8_SUB1 + b STRMM_L4x8_SUB1 -.LSTRMM_L4x8_SUB0: +STRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x8_SAVE - b .LSTRMM_L4x8_SUB2 + ble STRMM_L4x8_SAVE + b STRMM_L4x8_SUB2 -.LSTRMM_L4x8_SUB1: +STRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x8_SAVE + ble STRMM_L4x8_SAVE -.LSTRMM_L4x8_SUB2: +STRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x8_SUB2 + bgt STRMM_L4x8_SUB2 -.LSTRMM_L4x8_SAVE: +STRMM_L4x8_SAVE: SAVE4x8 @@ -1094,12 +1093,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x8_END: +STRMM_L4x8_END: -.LSTRMM_L4x4_BEGIN: +STRMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L4x4_END + ble STRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1125,11 +1124,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x4_SUB0 + ble STRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x4_SUB4 + ble STRMM_L4x4_SUB4 -.LSTRMM_L4x4_LOOP_START: +STRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -1143,11 +1142,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -2 - ble .LSTRMM_L4x4_LOOP_END + ble STRMM_L4x4_LOOP_END .align 5 -.LSTRMM_L4x4_LOOP: +STRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -1160,9 +1159,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LSTRMM_L4x4_LOOP + bgt STRMM_L4x4_LOOP -.LSTRMM_L4x4_LOOP_END: +STRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -1174,9 +1173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LSTRMM_L4x4_SUB1 + b STRMM_L4x4_SUB1 -.LSTRMM_L4x4_SUB4: +STRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -1188,31 +1187,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LSTRMM_L4x4_SUB1 + b STRMM_L4x4_SUB1 -.LSTRMM_L4x4_SUB0: +STRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x4_SAVE - b .LSTRMM_L4x4_SUB2 + ble STRMM_L4x4_SAVE + b STRMM_L4x4_SUB2 -.LSTRMM_L4x4_SUB1: +STRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x4_SAVE + ble STRMM_L4x4_SAVE -.LSTRMM_L4x4_SUB2: +STRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. 
L, L, -1 - bgt .LSTRMM_L4x4_SUB2 + bgt STRMM_L4x4_SUB2 -.LSTRMM_L4x4_SAVE: +STRMM_L4x4_SAVE: SAVE4x4 @@ -1229,12 +1228,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x4_END: +STRMM_L4x4_END: -.LSTRMM_L4x2_BEGIN: +STRMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L4x2_END + ble STRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1260,11 +1259,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x2_SUB0 + ble STRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x2_SUB4 + ble STRMM_L4x2_SUB4 -.LSTRMM_L4x2_LOOP_START: +STRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -1278,11 +1277,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LSTRMM_L4x2_LOOP_END + ble STRMM_L4x2_LOOP_END .align 5 -.LSTRMM_L4x2_LOOP: +STRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -1295,9 +1294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LSTRMM_L4x2_LOOP + bgt STRMM_L4x2_LOOP -.LSTRMM_L4x2_LOOP_END: +STRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -1309,9 +1308,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LSTRMM_L4x2_SUB1 + b STRMM_L4x2_SUB1 -.LSTRMM_L4x2_SUB4: +STRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -1323,31 +1322,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LSTRMM_L4x2_SUB1 + b STRMM_L4x2_SUB1 -.LSTRMM_L4x2_SUB0: +STRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x2_SAVE - b .LSTRMM_L4x2_SUB2 + ble STRMM_L4x2_SAVE + b STRMM_L4x2_SUB2 -.LSTRMM_L4x2_SUB1: +STRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x2_SAVE + ble STRMM_L4x2_SAVE -.LSTRMM_L4x2_SUB2: +STRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x2_SUB2 + bgt STRMM_L4x2_SUB2 -.LSTRMM_L4x2_SAVE: +STRMM_L4x2_SAVE: SAVE4x2 @@ -1364,12 +1363,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x2_END: +STRMM_L4x2_END: -.LSTRMM_L4x1_BEGIN: +STRMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L4x1_END + ble STRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1395,11 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x1_SUB0 + ble STRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x1_SUB4 + ble STRMM_L4x1_SUB4 -.LSTRMM_L4x1_LOOP_START: +STRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -1413,11 +1412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LSTRMM_L4x1_LOOP_END + ble STRMM_L4x1_LOOP_END .align 5 -.LSTRMM_L4x1_LOOP: +STRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -1430,9 +1429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LSTRMM_L4x1_LOOP + bgt STRMM_L4x1_LOOP -.LSTRMM_L4x1_LOOP_END: +STRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -1444,9 +1443,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x1_1 KERNEL4x1_E2 - b .LSTRMM_L4x1_SUB1 + b STRMM_L4x1_SUB1 -.LSTRMM_L4x1_SUB4: +STRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -1458,31 +1457,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LSTRMM_L4x1_SUB1 + b STRMM_L4x1_SUB1 -.LSTRMM_L4x1_SUB0: +STRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x1_SAVE - b .LSTRMM_L4x1_SUB2 + ble STRMM_L4x1_SAVE + b STRMM_L4x1_SUB2 -.LSTRMM_L4x1_SUB1: +STRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x1_SAVE + ble STRMM_L4x1_SAVE -.LSTRMM_L4x1_SUB2: +STRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x1_SUB2 + bgt STRMM_L4x1_SUB2 -.LSTRMM_L4x1_SAVE: +STRMM_L4x1_SAVE: SAVE4x1 @@ -1499,7 +1498,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x1_END: +STRMM_L4x1_END: slwi T1, K, 4 add B, B, T1 @@ -1509,11 +1508,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4_END: -.LSTRMM_L2_BEGIN: +STRMM_L4_END: +STRMM_L2_BEGIN: andi. T1, N, 2 - ble .LSTRMM_L2_END + ble STRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -1524,9 +1523,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L2x16_END + ble STRMM_L2x16_END -.LSTRMM_L2x16_BEGIN: +STRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1553,11 +1552,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x16_SUB0 + ble STRMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x16_SUB4 + ble STRMM_L2x16_SUB4 -.LSTRMM_L2x16_LOOP_START: +STRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -1580,11 +1579,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LSTRMM_L2x16_LOOP_END + ble STRMM_L2x16_LOOP_END .align 5 -.LSTRMM_L2x16_LOOP: +STRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -1605,9 +1604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LSTRMM_L2x16_LOOP + bgt STRMM_L2x16_LOOP -.LSTRMM_L2x16_LOOP_END: +STRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -1626,9 +1625,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LSTRMM_L2x16_SUB1 + b STRMM_L2x16_SUB1 -.LSTRMM_L2x16_SUB4: +STRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -1644,31 +1643,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LSTRMM_L2x16_SUB1 + b STRMM_L2x16_SUB1 -.LSTRMM_L2x16_SUB0: +STRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x16_SAVE - b .LSTRMM_L2x16_SUB2 + ble STRMM_L2x16_SAVE + b STRMM_L2x16_SUB2 -.LSTRMM_L2x16_SUB1: +STRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x16_SAVE + ble STRMM_L2x16_SAVE -.LSTRMM_L2x16_SUB2: +STRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x16_SUB2 + bgt STRMM_L2x16_SUB2 -.LSTRMM_L2x16_SAVE: +STRMM_L2x16_SAVE: SAVE2x16 @@ -1686,16 +1685,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L2x16_BEGIN + bgt STRMM_L2x16_BEGIN -.LSTRMM_L2x16_END: +STRMM_L2x16_END: -.LSTRMM_L2x8_BEGIN: +STRMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L2x1_END + ble STRMM_L2x1_END andi. 
T1, M, 8 - ble .LSTRMM_L2x8_END + ble STRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1721,11 +1720,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x8_SUB0 + ble STRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x8_SUB4 + ble STRMM_L2x8_SUB4 -.LSTRMM_L2x8_LOOP_START: +STRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -1739,11 +1738,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LSTRMM_L2x8_LOOP_END + ble STRMM_L2x8_LOOP_END .align 5 -.LSTRMM_L2x8_LOOP: +STRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -1756,9 +1755,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LSTRMM_L2x8_LOOP + bgt STRMM_L2x8_LOOP -.LSTRMM_L2x8_LOOP_END: +STRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1770,9 +1769,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LSTRMM_L2x8_SUB1 + b STRMM_L2x8_SUB1 -.LSTRMM_L2x8_SUB4: +STRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1784,31 +1783,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LSTRMM_L2x8_SUB1 + b STRMM_L2x8_SUB1 -.LSTRMM_L2x8_SUB0: +STRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x8_SAVE - b .LSTRMM_L2x8_SUB2 + ble STRMM_L2x8_SAVE + b STRMM_L2x8_SUB2 -.LSTRMM_L2x8_SUB1: +STRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x8_SAVE + ble STRMM_L2x8_SAVE -.LSTRMM_L2x8_SUB2: +STRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x8_SUB2 + bgt STRMM_L2x8_SUB2 -.LSTRMM_L2x8_SAVE: +STRMM_L2x8_SAVE: SAVE2x8 @@ -1825,12 +1824,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x8_END: +STRMM_L2x8_END: -.LSTRMM_L2x4_BEGIN: +STRMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L2x4_END + ble STRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1856,11 +1855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x4_SUB0 + ble STRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x4_SUB4 + ble STRMM_L2x4_SUB4 -.LSTRMM_L2x4_LOOP_START: +STRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1874,11 +1873,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LSTRMM_L2x4_LOOP_END + ble STRMM_L2x4_LOOP_END .align 5 -.LSTRMM_L2x4_LOOP: +STRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1891,9 +1890,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LSTRMM_L2x4_LOOP + bgt STRMM_L2x4_LOOP -.LSTRMM_L2x4_LOOP_END: +STRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1905,9 +1904,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LSTRMM_L2x4_SUB1 + b STRMM_L2x4_SUB1 -.LSTRMM_L2x4_SUB4: +STRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1919,31 +1918,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LSTRMM_L2x4_SUB1 + b STRMM_L2x4_SUB1 -.LSTRMM_L2x4_SUB0: +STRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. 
L, L, -1 - ble .LSTRMM_L2x4_SAVE - b .LSTRMM_L2x4_SUB2 + ble STRMM_L2x4_SAVE + b STRMM_L2x4_SUB2 -.LSTRMM_L2x4_SUB1: +STRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x4_SAVE + ble STRMM_L2x4_SAVE -.LSTRMM_L2x4_SUB2: +STRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x4_SUB2 + bgt STRMM_L2x4_SUB2 -.LSTRMM_L2x4_SAVE: +STRMM_L2x4_SAVE: SAVE2x4 @@ -1960,12 +1959,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x4_END: +STRMM_L2x4_END: -.LSTRMM_L2x2_BEGIN: +STRMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L2x2_END + ble STRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1991,11 +1990,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x2_SUB0 + ble STRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x2_SUB4 + ble STRMM_L2x2_SUB4 -.LSTRMM_L2x2_LOOP_START: +STRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -2009,11 +2008,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LSTRMM_L2x2_LOOP_END + ble STRMM_L2x2_LOOP_END .align 5 -.LSTRMM_L2x2_LOOP: +STRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -2026,9 +2025,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LSTRMM_L2x2_LOOP + bgt STRMM_L2x2_LOOP -.LSTRMM_L2x2_LOOP_END: +STRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -2040,9 +2039,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LSTRMM_L2x2_SUB1 + b STRMM_L2x2_SUB1 -.LSTRMM_L2x2_SUB4: +STRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -2054,31 +2053,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LSTRMM_L2x2_SUB1 + b STRMM_L2x2_SUB1 -.LSTRMM_L2x2_SUB0: +STRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x2_SAVE - b .LSTRMM_L2x2_SUB2 + ble STRMM_L2x2_SAVE + b STRMM_L2x2_SUB2 -.LSTRMM_L2x2_SUB1: +STRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x2_SAVE + ble STRMM_L2x2_SAVE -.LSTRMM_L2x2_SUB2: +STRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x2_SUB2 + bgt STRMM_L2x2_SUB2 -.LSTRMM_L2x2_SAVE: +STRMM_L2x2_SAVE: SAVE2x2 @@ -2095,12 +2094,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x2_END: +STRMM_L2x2_END: -.LSTRMM_L2x1_BEGIN: +STRMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L2x1_END + ble STRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2126,11 +2125,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x1_SUB0 + ble STRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x1_SUB4 + ble STRMM_L2x1_SUB4 -.LSTRMM_L2x1_LOOP_START: +STRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -2144,11 +2143,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LSTRMM_L2x1_LOOP_END + ble STRMM_L2x1_LOOP_END .align 5 -.LSTRMM_L2x1_LOOP: +STRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -2161,9 +2160,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. 
L, L, -1 - bgt .LSTRMM_L2x1_LOOP + bgt STRMM_L2x1_LOOP -.LSTRMM_L2x1_LOOP_END: +STRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -2175,9 +2174,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LSTRMM_L2x1_SUB1 + b STRMM_L2x1_SUB1 -.LSTRMM_L2x1_SUB4: +STRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -2189,31 +2188,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LSTRMM_L2x1_SUB1 + b STRMM_L2x1_SUB1 -.LSTRMM_L2x1_SUB0: +STRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x1_SAVE - b .LSTRMM_L2x1_SUB2 + ble STRMM_L2x1_SAVE + b STRMM_L2x1_SUB2 -.LSTRMM_L2x1_SUB1: +STRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x1_SAVE + ble STRMM_L2x1_SAVE -.LSTRMM_L2x1_SUB2: +STRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x1_SUB2 + bgt STRMM_L2x1_SUB2 -.LSTRMM_L2x1_SAVE: +STRMM_L2x1_SAVE: SAVE2x1 @@ -2230,7 +2229,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x1_END: +STRMM_L2x1_END: slwi T1, K, 3 add B, B, T1 @@ -2240,11 +2239,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2_END: -.LSTRMM_L1_BEGIN: +STRMM_L2_END: +STRMM_L1_BEGIN: andi. T1, N, 1 - ble .LSTRMM_L1_END + ble STRMM_L1_END mr CO, C mr AO, A @@ -2253,9 +2252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L1x16_END + ble STRMM_L1x16_END -.LSTRMM_L1x16_BEGIN: +STRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2282,11 +2281,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x16_SUB0 + ble STRMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x16_SUB4 + ble STRMM_L1x16_SUB4 -.LSTRMM_L1x16_LOOP_START: +STRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -2309,11 +2308,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LSTRMM_L1x16_LOOP_END + ble STRMM_L1x16_LOOP_END .align 5 -.LSTRMM_L1x16_LOOP: +STRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -2334,9 +2333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LSTRMM_L1x16_LOOP + bgt STRMM_L1x16_LOOP -.LSTRMM_L1x16_LOOP_END: +STRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -2355,9 +2354,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LSTRMM_L1x16_SUB1 + b STRMM_L1x16_SUB1 -.LSTRMM_L1x16_SUB4: +STRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -2373,31 +2372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LSTRMM_L1x16_SUB1 + b STRMM_L1x16_SUB1 -.LSTRMM_L1x16_SUB0: +STRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x16_SAVE - b .LSTRMM_L1x16_SUB2 + ble STRMM_L1x16_SAVE + b STRMM_L1x16_SUB2 -.LSTRMM_L1x16_SUB1: +STRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x16_SAVE + ble STRMM_L1x16_SAVE -.LSTRMM_L1x16_SUB2: +STRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x16_SUB2 + bgt STRMM_L1x16_SUB2 -.LSTRMM_L1x16_SAVE: +STRMM_L1x16_SAVE: SAVE1x16 @@ -2415,16 +2414,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. 
I, I, -1 - bgt .LSTRMM_L1x16_BEGIN + bgt STRMM_L1x16_BEGIN -.LSTRMM_L1x16_END: +STRMM_L1x16_END: -.LSTRMM_L1x8_BEGIN: +STRMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L1x1_END + ble STRMM_L1x1_END andi. T1, M, 8 - ble .LSTRMM_L1x8_END + ble STRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2450,11 +2449,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x8_SUB0 + ble STRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x8_SUB4 + ble STRMM_L1x8_SUB4 -.LSTRMM_L1x8_LOOP_START: +STRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -2468,11 +2467,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LSTRMM_L1x8_LOOP_END + ble STRMM_L1x8_LOOP_END .align 5 -.LSTRMM_L1x8_LOOP: +STRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -2485,9 +2484,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LSTRMM_L1x8_LOOP + bgt STRMM_L1x8_LOOP -.LSTRMM_L1x8_LOOP_END: +STRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -2499,9 +2498,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LSTRMM_L1x8_SUB1 + b STRMM_L1x8_SUB1 -.LSTRMM_L1x8_SUB4: +STRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -2513,31 +2512,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LSTRMM_L1x8_SUB1 + b STRMM_L1x8_SUB1 -.LSTRMM_L1x8_SUB0: +STRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x8_SAVE - b .LSTRMM_L1x8_SUB2 + ble STRMM_L1x8_SAVE + b STRMM_L1x8_SUB2 -.LSTRMM_L1x8_SUB1: +STRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x8_SAVE + ble STRMM_L1x8_SAVE -.LSTRMM_L1x8_SUB2: +STRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x8_SUB2 + bgt STRMM_L1x8_SUB2 -.LSTRMM_L1x8_SAVE: +STRMM_L1x8_SAVE: SAVE1x8 @@ -2554,12 +2553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x8_END: +STRMM_L1x8_END: -.LSTRMM_L1x4_BEGIN: +STRMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L1x4_END + ble STRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2585,11 +2584,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x4_SUB0 + ble STRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x4_SUB4 + ble STRMM_L1x4_SUB4 -.LSTRMM_L1x4_LOOP_START: +STRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -2603,11 +2602,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LSTRMM_L1x4_LOOP_END + ble STRMM_L1x4_LOOP_END .align 5 -.LSTRMM_L1x4_LOOP: +STRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -2620,9 +2619,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LSTRMM_L1x4_LOOP + bgt STRMM_L1x4_LOOP -.LSTRMM_L1x4_LOOP_END: +STRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -2634,9 +2633,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LSTRMM_L1x4_SUB1 + b STRMM_L1x4_SUB1 -.LSTRMM_L1x4_SUB4: +STRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -2648,31 +2647,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LSTRMM_L1x4_SUB1 + b STRMM_L1x4_SUB1 -.LSTRMM_L1x4_SUB0: +STRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x4_SAVE - b .LSTRMM_L1x4_SUB2 + ble STRMM_L1x4_SAVE + b STRMM_L1x4_SUB2 -.LSTRMM_L1x4_SUB1: +STRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x4_SAVE + ble STRMM_L1x4_SAVE -.LSTRMM_L1x4_SUB2: +STRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x4_SUB2 + bgt STRMM_L1x4_SUB2 -.LSTRMM_L1x4_SAVE: +STRMM_L1x4_SAVE: SAVE1x4 @@ -2689,12 +2688,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x4_END: +STRMM_L1x4_END: -.LSTRMM_L1x2_BEGIN: +STRMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L1x2_END + ble STRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2720,11 +2719,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x2_SUB0 + ble STRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x2_SUB4 + ble STRMM_L1x2_SUB4 -.LSTRMM_L1x2_LOOP_START: +STRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -2738,11 +2737,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LSTRMM_L1x2_LOOP_END + ble STRMM_L1x2_LOOP_END .align 5 -.LSTRMM_L1x2_LOOP: +STRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -2755,9 +2754,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LSTRMM_L1x2_LOOP + bgt STRMM_L1x2_LOOP -.LSTRMM_L1x2_LOOP_END: +STRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2769,9 +2768,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LSTRMM_L1x2_SUB1 + b STRMM_L1x2_SUB1 -.LSTRMM_L1x2_SUB4: +STRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2783,31 +2782,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LSTRMM_L1x2_SUB1 + b STRMM_L1x2_SUB1 -.LSTRMM_L1x2_SUB0: +STRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x2_SAVE - b .LSTRMM_L1x2_SUB2 + ble STRMM_L1x2_SAVE + b STRMM_L1x2_SUB2 -.LSTRMM_L1x2_SUB1: +STRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x2_SAVE + ble STRMM_L1x2_SAVE -.LSTRMM_L1x2_SUB2: +STRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x2_SUB2 + bgt STRMM_L1x2_SUB2 -.LSTRMM_L1x2_SAVE: +STRMM_L1x2_SAVE: SAVE1x2 @@ -2824,12 +2823,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x2_END: +STRMM_L1x2_END: -.LSTRMM_L1x1_BEGIN: +STRMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L1x1_END + ble STRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2855,11 +2854,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x1_SUB0 + ble STRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x1_SUB4 + ble STRMM_L1x1_SUB4 -.LSTRMM_L1x1_LOOP_START: +STRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2873,11 +2872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. 
L, L, -2 - ble .LSTRMM_L1x1_LOOP_END + ble STRMM_L1x1_LOOP_END .align 5 -.LSTRMM_L1x1_LOOP: +STRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2890,9 +2889,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LSTRMM_L1x1_LOOP + bgt STRMM_L1x1_LOOP -.LSTRMM_L1x1_LOOP_END: +STRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2904,9 +2903,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LSTRMM_L1x1_SUB1 + b STRMM_L1x1_SUB1 -.LSTRMM_L1x1_SUB4: +STRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2918,31 +2917,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LSTRMM_L1x1_SUB1 + b STRMM_L1x1_SUB1 -.LSTRMM_L1x1_SUB0: +STRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x1_SAVE - b .LSTRMM_L1x1_SUB2 + ble STRMM_L1x1_SAVE + b STRMM_L1x1_SUB2 -.LSTRMM_L1x1_SUB1: +STRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x1_SAVE + ble STRMM_L1x1_SAVE -.LSTRMM_L1x1_SUB2: +STRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x1_SUB2 + bgt STRMM_L1x1_SUB2 -.LSTRMM_L1x1_SAVE: +STRMM_L1x1_SAVE: SAVE1x1 @@ -2959,11 +2958,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x1_END: +STRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -.LSTRMM_L1_END: +STRMM_L1_END: diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S new file mode 100644 index 000000000..27bc1e89c --- /dev/null +++ b/kernel/power/strmm_macros_16x8_power8.S @@ -0,0 +1,5840 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, 
vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + 
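
For reference: the KERNEL8x16_* macros in this new file all perform the same rank-1 update of a 16-row by 8-column accumulator tile held in vs32-vs63. They differ only in whether they initialize the tile (xvmulsp, the *_I1 / *_SUBI1 forms) or accumulate into it (xvmaddasp), and in which load/splat register set they consume (vs0-vs3 with vs8-vs15 versus vs4-vs7 with vs16-vs23), which is what lets the *_1 / *_2 pair be software-pipelined. Below is a minimal scalar C model of one k-step, with illustrative names (acc, a, b, kernel8x16_step) that do not appear in the patch; it is a sketch of the arithmetic only, not of the vector scheduling.

    /* Scalar sketch of one KERNEL8x16 step: acc[j][i] += a[i] * b[j].
       In the assembly, each xxspltw fixes one b[j] across a 4-lane vector and
       each xvmaddasp covers four consecutive values of i. */
    #include <stdio.h>

    static void kernel8x16_step(float acc[8][16], const float a[16], const float b[8])
    {
        for (int j = 0; j < 8; j++)          /* 8 broadcast B values (vs8-vs15)   */
            for (int i = 0; i < 16; i++)     /* 16 A values in four VSX vectors   */
                acc[j][i] += a[i] * b[j];
    }

    int main(void)
    {
        float acc[8][16] = {{0}}, a[16], b[8];
        for (int i = 0; i < 16; i++) a[i] = (float)(i + 1);
        for (int j = 0; j < 8; j++)  b[j] = 0.5f * (float)j;
        kernel8x16_step(acc, a, b);             /* one k-iteration of the micro-kernel */
        printf("acc[1][2] = %f\n", acc[1][2]);  /* expect a[2]*b[1] = 1.5 */
        return 0;
    }

In the vectorized SAVE8x16 path further down, this tile is then scaled by alpha_vr in one xvmulsp/xvmaddasp per vector instead of the earlier per-element TBUFFER round trip.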
+.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + 
+#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + 
xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + 
xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + 
xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp 
vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef 
TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + 
xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + 
xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + 
xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 
3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 
+ xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, 
vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* 
Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 
+ xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + 
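+/* Editorial note (illustrative only): every tile size follows the macro
+ * convention seen above.  LOAD<n>x<m>_1 primes one register set,
+ * KERNEL<n>x<m>_I1 starts the accumulators with xvmulsp/xsmuldp while
+ * prefetching the second set, the _1/_2 pair ping-pongs between the two sets,
+ * _E2 drains the last prefetched set, _SUBI1/_SUB1 are standalone single-step
+ * variants (initialize and accumulate, respectively), and SAVE<n>x<m> applies
+ * alpha (alpha*acc for TRMMKERNEL, C += alpha*acc otherwise).  For M <= 2 the
+ * kernels drop to scalar lxsspx / xsmaddadp with alpha_r, so those
+ * accumulators are kept in double precision.  A hypothetical C model of one
+ * scalar 2x2 step:
+ *
+ *     static void kernel2x2_step(double acc[2][2],
+ *                                const float a[2], const float b[2])
+ *     {
+ *         for (int j = 0; j < 2; j++)        // vs8/vs9: B values via lxsspx
+ *             for (int i = 0; i < 2; i++)    // vs0/vs1: A values
+ *                 acc[j][i] += (double)a[i] * b[j];  // xsmaddadp accumulates in DP
+ *     }
+ */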
+/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, 
vs8
+
+	xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+	xxspltw vs9, vs28, 1
+
+	addi BO, BO, 8
+
+
+	xvmaddasp vs32, vs4, vs16
+
+	xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+	xvmaddasp vs32, vs4, vs16
+
+	xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+	xxspltw vs9, vs28, 1
+
+	addi BO, BO, 8
+
+
+	xvmulsp vs32, vs0, vs8
+
+	xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+	xxspltw vs9, vs28, 1
+
+	addi BO, BO, 8
+
+
+	xvmaddasp vs32, vs0, vs8
+
+	xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xvmulsp vs0, vs32, alpha_vr
+#else
+	xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+	stxvw4x vs0, o0, T1
+
+	add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+	lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xvmulsp vs0, vs33, alpha_vr
+#else
+	xvmaddasp vs0, vs33, alpha_vr
+#endif
+
+	stxvw4x vs0, o0, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+	lxsspx vs4, o0, AO
+	lxsspx vs5, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+	lxsspx vs17, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmuldp vs32, vs0, vs8
+	xsmuldp vs33, vs1, vs8
+
+	xsmuldp vs34, vs0, vs9
+	xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+	lxsspx vs4, o0, AO
+	lxsspx vs5, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+	lxsspx vs17, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs0, vs8
+	xsmaddadp vs33, vs1, vs8
+
+	xsmaddadp vs34, vs0, vs9
+	xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs4, vs16
+	xsmaddadp vs33, vs5, vs16
+
+	xsmaddadp vs34, vs4, vs17
+	xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+	xsmaddadp vs32, vs4, vs16
+	xsmaddadp vs33, vs5, vs16
+
+	xsmaddadp vs34, vs4, vs17
+	xsmaddadp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmuldp vs32, vs0, vs8
+	xsmuldp vs33, vs1, vs8
+
+	xsmuldp vs34, vs0, vs9
+	xsmuldp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs0, vs8
+	xsmaddadp vs33, vs1, vs8
+
+	xsmaddadp vs34, vs0, vs9
+	xsmaddadp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+	lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs32, alpha_r
+	xsmuldp vs1, vs33, alpha_r
+#else
+	xsmaddadp vs0, vs32, alpha_r
+	xsmaddadp vs1, vs33, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+	stxsspx vs1, o4, T1
+
+	add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+	lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs34, alpha_r
+	xsmuldp vs1, vs35, alpha_r
+#else
+	xsmaddadp vs0, vs34, alpha_r
+	xsmaddadp vs1, vs35, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+	stxsspx vs1, o4, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+	lxsspx vs4, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+	lxsspx vs17, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmuldp vs32, vs0, vs8
+
+	xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+	lxsspx vs4, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+	lxsspx vs17, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs0, vs8
+
+	xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs4, vs16
+
+	xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+	xsmaddadp vs32, vs4, vs16
+
+	xsmaddadp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmuldp vs32, vs0, vs8
+
+	xsmuldp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+	lxsspx vs9, o4, T1
+
+	addi BO, BO, 8
+
+
+	xsmaddadp vs32, vs0, vs8
+
+	xsmaddadp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs32, alpha_r
+#else
+	xsmaddadp vs0, vs32, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+
+	add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs33, alpha_r
+#else
+	xsmaddadp vs0, vs33, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+.macro LOAD1x16_1
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+	lxvw4x vs2, o32, AO
+	lxvw4x vs3, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+	lxvw4x vs4, o0, AO
+	lxvw4x vs5, o16, AO
+	lxvw4x vs6, o32, AO
+	lxvw4x vs7, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+	xvmulsp vs33, vs1, vs8
+	xvmulsp vs34, vs2, vs8
+	xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+	lxvw4x vs4, o0, AO
+	lxvw4x vs5, o16, AO
+	lxvw4x vs6, o32, AO
+	lxvw4x vs7, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+	xvmaddasp vs33, vs1, vs8
+	xvmaddasp vs34, vs2, vs8
+	xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+	lxvw4x vs2, o32, AO
+	lxvw4x vs3, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs4, vs16
+	xvmaddasp vs33, vs5, vs16
+	xvmaddasp vs34, vs6, vs16
+	xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+	xvmaddasp vs32, vs4, vs16
+	xvmaddasp vs33, vs5, vs16
+	xvmaddasp vs34, vs6, vs16
+	xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+	lxvw4x vs2, o32, AO
+	lxvw4x vs3, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+	xvmulsp vs33, vs1, vs8
+	xvmulsp vs34, vs2, vs8
+	xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+	lxvw4x vs2, o32, AO
+	lxvw4x vs3, o48, AO
+
+	addi AO, AO, 64
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+	xvmaddasp vs33, vs1, vs8
+	xvmaddasp vs34, vs2, vs8
+	xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxvw4x vs0, o0, T1
+	lxvw4x vs1, o16, T1
+	lxvw4x vs2, o32, T1
+	lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xvmulsp vs0, vs32, alpha_vr
+	xvmulsp vs1, vs33, alpha_vr
+	xvmulsp vs2, vs34, alpha_vr
+	xvmulsp vs3, vs35, alpha_vr
+#else
+	xvmaddasp vs0, vs32, alpha_vr
+	xvmaddasp vs1, vs33, alpha_vr
+	xvmaddasp vs2, vs34, alpha_vr
+	xvmaddasp vs3, vs35, alpha_vr
+#endif
+
+	stxvw4x vs0, o0, T1
+	stxvw4x vs1, o16, T1
+	stxvw4x vs2, o32, T1
+	stxvw4x vs3, o48, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+	lxvw4x vs4, o0, AO
+	lxvw4x vs5, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+	xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+	lxvw4x vs4, o0, AO
+	lxvw4x vs5, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+	xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs4, vs16
+	xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+	xvmaddasp vs32, vs4, vs16
+	xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+	xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+	lxvw4x vs0, o0, AO
+	lxvw4x vs1, o16, AO
+
+	addi AO, AO, 32
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+	xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxvw4x vs0, o0, T1
+	lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xvmulsp vs0, vs32, alpha_vr
+	xvmulsp vs1, vs33, alpha_vr
+#else
+	xvmaddasp vs0, vs32, alpha_vr
+	xvmaddasp vs1, vs33, alpha_vr
+#endif
+
+	stxvw4x vs0, o0, T1
+	stxvw4x vs1, o16, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+	lxvw4x vs4, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+	lxvw4x vs4, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs16, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+	xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+	lxvw4x vs0, o0, AO
+
+	addi AO, AO, 16
+
+	lxvw4x vs28, o0, BO
+
+	xxspltw vs8, vs28, 0
+
+	addi BO, BO, 4
+
+
+	xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xvmulsp vs0, vs32, alpha_vr
+#else
+	xvmaddasp vs0, vs32, alpha_vr
+#endif
+
+	stxvw4x vs0, o0, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+	lxsspx vs4, o0, AO
+	lxsspx vs5, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmuldp vs32, vs0, vs8
+	xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+	lxsspx vs4, o0, AO
+	lxsspx vs5, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs0, vs8
+	xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs4, vs16
+	xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+	xsmaddadp vs32, vs4, vs16
+	xsmaddadp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmuldp vs32, vs0, vs8
+	xsmuldp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+	lxsspx vs0, o0, AO
+	lxsspx vs1, o4, AO
+
+	addi AO, AO, 8
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs0, vs8
+	xsmaddadp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+	lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs32, alpha_r
+	xsmuldp vs1, vs33, alpha_r
+#else
+	xsmaddadp vs0, vs32, alpha_r
+	xsmaddadp vs1, vs33, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+	stxsspx vs1, o4, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+	lxsspx vs4, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+	lxsspx vs4, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs16, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+	xsmaddadp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmuldp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+	lxsspx vs0, o0, AO
+
+	addi AO, AO, 4
+
+	mr T1, BO
+
+	lxsspx vs8, o0, T1
+
+	addi BO, BO, 4
+
+
+	xsmaddadp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+	mr T1, CO
+
+#ifndef TRMMKERNEL
+
+	lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+	xsmuldp vs0, vs32, alpha_r
+#else
+	xsmaddadp vs0, vs32, alpha_r
+#endif
+
+	stxsspx vs0, o0, T1
+
+	add T1, T1, LDC
+
+	addi CO, CO, 4
+
+.endm
+
diff --git a/param.h b/param.h
index 370d10b9a..fb344cd33 100644
--- a/param.h
+++ b/param.h
@@ -1964,7 +1964,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SNUMOPT		16
 #define DNUMOPT		8
 
-#define GEMM_DEFAULT_OFFSET_A  384
+#define GEMM_DEFAULT_OFFSET_A 131072
 #define GEMM_DEFAULT_OFFSET_B 1024
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
@@ -1977,17 +1977,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P  480
+#define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
 #define CGEMM_DEFAULT_P  480
 #define ZGEMM_DEFAULT_P  240
 
-#define SGEMM_DEFAULT_Q  1440
+#define SGEMM_DEFAULT_Q  720
 #define DGEMM_DEFAULT_Q  720
 #define CGEMM_DEFAULT_Q  720
 #define ZGEMM_DEFAULT_Q  360
 
-#define SGEMM_DEFAULT_R 28800
+#define SGEMM_DEFAULT_R 14400
 #define DGEMM_DEFAULT_R 14400
 #define CGEMM_DEFAULT_R 14400
 #define ZGEMM_DEFAULT_R 7200