From d4c0330967f13ce916da41391bc1ccf383c34b5b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 3 Apr 2016 14:30:49 +0200 Subject: [PATCH] updated cgemm- and ctrmm-kernel for POWER8 --- kernel/power/cgemm_kernel_8x4_power8.S | 46 +- kernel/power/cgemm_logic_8x4_power8.S | 558 ++-- kernel/power/cgemm_macros_8x4_power8.S | 4077 ++++++++++++------------ kernel/power/ctrmm_kernel_8x4_power8.S | 44 +- kernel/power/ctrmm_logic_8x4_power8.S | 555 ++-- 5 files changed, 2698 insertions(+), 2582 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index f732c8132..a7e706699 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -130,10 +130,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 -#define TBUFFER r14 +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + + +#define NOTUSED r14 #define L r15 #define o12 r16 #define o4 r17 @@ -271,21 +275,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_8x4_power8.S" cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 - addi TBUFFER, SP, 360 #ifdef __64BIT__ @@ -294,14 +297,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi T1 , SP, 224 #endif - lxsspx alpha_r, 0, T1 - lxsspx alpha_i, o8, T1 + stxsspx vs1, 0, T1 + lxsspx alpha_dr, 0, T1 + stxsspx vs2, o8 , T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 .align 5 #include "cgemm_logic_8x4_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S index 51a063126..851a09aaa 100644 --- a/kernel/power/cgemm_logic_8x4_power8.S +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -26,38 +26,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. 
J, N, 2 - ble .LCGEMM_L4_END + ble CGEMM_L4_END -.LCGEMM_L4_BEGIN: +CGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 3 - ble .LCGEMM_L4x8_END + ble CGEMM_L4x8_END -.LCGEMM_L4x8_BEGIN: +CGEMM_L4x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x8_SUB0 + ble CGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x8_SUB4 + ble CGEMM_L4x8_SUB4 -.LCGEMM_L4x8_LOOP_START: +CGEMM_L4x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD4x8_1 KERNEL4x8_I1 dcbt AO, PRE @@ -68,17 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 - ble .LCGEMM_L4x8_LOOP_END + ble CGEMM_L4x8_LOOP_END .align 5 -.LCGEMM_L4x8_LOOP: +CGEMM_L4x8_LOOP: KERNEL4x8_1 dcbt AO, PRE @@ -89,15 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 - bgt .LCGEMM_L4x8_LOOP + bgt CGEMM_L4x8_LOOP -.LCGEMM_L4x8_LOOP_END: +CGEMM_L4x8_LOOP_END: KERNEL4x8_1 dcbt AO, PRE @@ -112,9 +114,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LCGEMM_L4x8_SUB1 + b CGEMM_L4x8_SUB1 -.LCGEMM_L4x8_SUB4: +CGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -126,53 +128,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LCGEMM_L4x8_SUB1 + b CGEMM_L4x8_SUB1 -.LCGEMM_L4x8_SUB0: +CGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x8_SAVE - b .LCGEMM_L4x8_SUB2 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 -.LCGEMM_L4x8_SUB1: +CGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x8_SAVE + ble CGEMM_L4x8_SAVE -.LCGEMM_L4x8_SUB2: +CGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x8_SUB2 + bgt CGEMM_L4x8_SUB2 -.LCGEMM_L4x8_SAVE: +CGEMM_L4x8_SAVE: SAVE4x8 addic. I, I, -1 - bgt .LCGEMM_L4x8_BEGIN + bgt CGEMM_L4x8_BEGIN -.LCGEMM_L4x8_END: +CGEMM_L4x8_END: -.LCGEMM_L4x4_BEGIN: +CGEMM_L4x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L4x1_END + ble CGEMM_L4x1_END andi. T1, M, 4 - ble .LCGEMM_L4x4_END + ble CGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x4_SUB0 + ble CGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x4_SUB4 + ble CGEMM_L4x4_SUB4 -.LCGEMM_L4x4_LOOP_START: +CGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -186,11 +188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -2 - ble .LCGEMM_L4x4_LOOP_END + ble CGEMM_L4x4_LOOP_END .align 5 -.LCGEMM_L4x4_LOOP: +CGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -203,9 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LCGEMM_L4x4_LOOP + bgt CGEMM_L4x4_LOOP -.LCGEMM_L4x4_LOOP_END: +CGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -217,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LCGEMM_L4x4_SUB1 + b CGEMM_L4x4_SUB1 -.LCGEMM_L4x4_SUB4: +CGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -231,48 +233,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LCGEMM_L4x4_SUB1 + b CGEMM_L4x4_SUB1 -.LCGEMM_L4x4_SUB0: +CGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x4_SAVE - b .LCGEMM_L4x4_SUB2 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 -.LCGEMM_L4x4_SUB1: +CGEMM_L4x4_SUB1: andi. 
L, K, 7 - ble .LCGEMM_L4x4_SAVE + ble CGEMM_L4x4_SAVE -.LCGEMM_L4x4_SUB2: +CGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x4_SUB2 + bgt CGEMM_L4x4_SUB2 -.LCGEMM_L4x4_SAVE: +CGEMM_L4x4_SAVE: SAVE4x4 -.LCGEMM_L4x4_END: +CGEMM_L4x4_END: -.LCGEMM_L4x2_BEGIN: +CGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LCGEMM_L4x2_END + ble CGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x2_SUB0 + ble CGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x2_SUB4 + ble CGEMM_L4x2_SUB4 -.LCGEMM_L4x2_LOOP_START: +CGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -286,11 +288,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LCGEMM_L4x2_LOOP_END + ble CGEMM_L4x2_LOOP_END .align 5 -.LCGEMM_L4x2_LOOP: +CGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -303,9 +305,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LCGEMM_L4x2_LOOP + bgt CGEMM_L4x2_LOOP -.LCGEMM_L4x2_LOOP_END: +CGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -317,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LCGEMM_L4x2_SUB1 + b CGEMM_L4x2_SUB1 -.LCGEMM_L4x2_SUB4: +CGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -331,48 +333,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LCGEMM_L4x2_SUB1 + b CGEMM_L4x2_SUB1 -.LCGEMM_L4x2_SUB0: +CGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x2_SAVE - b .LCGEMM_L4x2_SUB2 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 -.LCGEMM_L4x2_SUB1: +CGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x2_SAVE + ble CGEMM_L4x2_SAVE -.LCGEMM_L4x2_SUB2: +CGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x2_SUB2 + bgt CGEMM_L4x2_SUB2 -.LCGEMM_L4x2_SAVE: +CGEMM_L4x2_SAVE: SAVE4x2 -.LCGEMM_L4x2_END: +CGEMM_L4x2_END: -.LCGEMM_L4x1_BEGIN: +CGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L4x1_END + ble CGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x1_SUB0 + ble CGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x1_SUB4 + ble CGEMM_L4x1_SUB4 -.LCGEMM_L4x1_LOOP_START: +CGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -386,11 +388,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LCGEMM_L4x1_LOOP_END + ble CGEMM_L4x1_LOOP_END .align 5 -.LCGEMM_L4x1_LOOP: +CGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -403,9 +405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LCGEMM_L4x1_LOOP + bgt CGEMM_L4x1_LOOP -.LCGEMM_L4x1_LOOP_END: +CGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -417,9 +419,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LCGEMM_L4x1_SUB1 + b CGEMM_L4x1_SUB1 -.LCGEMM_L4x1_SUB4: +CGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -431,74 +433,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LCGEMM_L4x1_SUB1 + b CGEMM_L4x1_SUB1 -.LCGEMM_L4x1_SUB0: +CGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x1_SAVE - b .LCGEMM_L4x1_SUB2 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 -.LCGEMM_L4x1_SUB1: +CGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x1_SAVE + ble CGEMM_L4x1_SAVE -.LCGEMM_L4x1_SUB2: +CGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. 
L, L, -1 - bgt .LCGEMM_L4x1_SUB2 + bgt CGEMM_L4x1_SUB2 -.LCGEMM_L4x1_SAVE: +CGEMM_L4x1_SAVE: SAVE4x1 -.LCGEMM_L4x1_END: +CGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LCGEMM_L4_BEGIN + bgt CGEMM_L4_BEGIN andi. T2, N, 3 - ble .L999_H2 + ble L999_H2 -.LCGEMM_L4_END: +CGEMM_L4_END: - b .LCGEMM_L2_BEGIN + b CGEMM_L2_BEGIN -.L999_H1: +L999_H1: - b .L999_H2 + b L999_H2 -.LCGEMM_L2_BEGIN: +CGEMM_L2_BEGIN: andi. T1, N, 2 - ble .LCGEMM_L2_END + ble CGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LCGEMM_L2x8_END + ble CGEMM_L2x8_END -.LCGEMM_L2x8_BEGIN: +CGEMM_L2x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x8_SUB0 + ble CGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x8_SUB4 + ble CGEMM_L2x8_SUB4 -.LCGEMM_L2x8_LOOP_START: +CGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -517,11 +519,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LCGEMM_L2x8_LOOP_END + ble CGEMM_L2x8_LOOP_END .align 5 -.LCGEMM_L2x8_LOOP: +CGEMM_L2x8_LOOP: KERNEL2x8_1 dcbt AO, PRE @@ -538,9 +540,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LCGEMM_L2x8_LOOP + bgt CGEMM_L2x8_LOOP -.LCGEMM_L2x8_LOOP_END: +CGEMM_L2x8_LOOP_END: KERNEL2x8_1 dcbt AO, PRE @@ -555,9 +557,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LCGEMM_L2x8_SUB1 + b CGEMM_L2x8_SUB1 -.LCGEMM_L2x8_SUB4: +CGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -569,53 +571,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LCGEMM_L2x8_SUB1 + b CGEMM_L2x8_SUB1 -.LCGEMM_L2x8_SUB0: +CGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x8_SAVE - b .LCGEMM_L2x8_SUB2 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 -.LCGEMM_L2x8_SUB1: +CGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x8_SAVE + ble CGEMM_L2x8_SAVE -.LCGEMM_L2x8_SUB2: +CGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x8_SUB2 + bgt CGEMM_L2x8_SUB2 -.LCGEMM_L2x8_SAVE: +CGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LCGEMM_L2x8_BEGIN + bgt CGEMM_L2x8_BEGIN -.LCGEMM_L2x8_END: +CGEMM_L2x8_END: -.LCGEMM_L2x4_BEGIN: +CGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L2x1_END + ble CGEMM_L2x1_END andi. T1, M, 4 - ble .LCGEMM_L2x4_END + ble CGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x4_SUB0 + ble CGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x4_SUB4 + ble CGEMM_L2x4_SUB4 -.LCGEMM_L2x4_LOOP_START: +CGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -629,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LCGEMM_L2x4_LOOP_END + ble CGEMM_L2x4_LOOP_END .align 5 -.LCGEMM_L2x4_LOOP: +CGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -646,9 +648,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LCGEMM_L2x4_LOOP + bgt CGEMM_L2x4_LOOP -.LCGEMM_L2x4_LOOP_END: +CGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -660,9 +662,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LCGEMM_L2x4_SUB1 + b CGEMM_L2x4_SUB1 -.LCGEMM_L2x4_SUB4: +CGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -674,48 +676,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LCGEMM_L2x4_SUB1 + b CGEMM_L2x4_SUB1 -.LCGEMM_L2x4_SUB0: +CGEMM_L2x4_SUB0: andi. 
L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x4_SAVE - b .LCGEMM_L2x4_SUB2 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 -.LCGEMM_L2x4_SUB1: +CGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x4_SAVE + ble CGEMM_L2x4_SAVE -.LCGEMM_L2x4_SUB2: +CGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x4_SUB2 + bgt CGEMM_L2x4_SUB2 -.LCGEMM_L2x4_SAVE: +CGEMM_L2x4_SAVE: SAVE2x4 -.LCGEMM_L2x4_END: +CGEMM_L2x4_END: -.LCGEMM_L2x2_BEGIN: +CGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LCGEMM_L2x2_END + ble CGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x2_SUB0 + ble CGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x2_SUB4 + ble CGEMM_L2x2_SUB4 -.LCGEMM_L2x2_LOOP_START: +CGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -729,11 +731,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LCGEMM_L2x2_LOOP_END + ble CGEMM_L2x2_LOOP_END .align 5 -.LCGEMM_L2x2_LOOP: +CGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -746,9 +748,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LCGEMM_L2x2_LOOP + bgt CGEMM_L2x2_LOOP -.LCGEMM_L2x2_LOOP_END: +CGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -760,9 +762,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LCGEMM_L2x2_SUB1 + b CGEMM_L2x2_SUB1 -.LCGEMM_L2x2_SUB4: +CGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -774,48 +776,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LCGEMM_L2x2_SUB1 + b CGEMM_L2x2_SUB1 -.LCGEMM_L2x2_SUB0: +CGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x2_SAVE - b .LCGEMM_L2x2_SUB2 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 -.LCGEMM_L2x2_SUB1: +CGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x2_SAVE + ble CGEMM_L2x2_SAVE -.LCGEMM_L2x2_SUB2: +CGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x2_SUB2 + bgt CGEMM_L2x2_SUB2 -.LCGEMM_L2x2_SAVE: +CGEMM_L2x2_SAVE: SAVE2x2 -.LCGEMM_L2x2_END: +CGEMM_L2x2_END: -.LCGEMM_L2x1_BEGIN: +CGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L2x1_END + ble CGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x1_SUB0 + ble CGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x1_SUB4 + ble CGEMM_L2x1_SUB4 -.LCGEMM_L2x1_LOOP_START: +CGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -829,11 +831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LCGEMM_L2x1_LOOP_END + ble CGEMM_L2x1_LOOP_END .align 5 -.LCGEMM_L2x1_LOOP: +CGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -846,9 +848,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LCGEMM_L2x1_LOOP + bgt CGEMM_L2x1_LOOP -.LCGEMM_L2x1_LOOP_END: +CGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -860,9 +862,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LCGEMM_L2x1_SUB1 + b CGEMM_L2x1_SUB1 -.LCGEMM_L2x1_SUB4: +CGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -874,66 +876,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LCGEMM_L2x1_SUB1 + b CGEMM_L2x1_SUB1 -.LCGEMM_L2x1_SUB0: +CGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x1_SAVE - b .LCGEMM_L2x1_SUB2 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 -.LCGEMM_L2x1_SUB1: +CGEMM_L2x1_SUB1: andi. 
L, K, 7 - ble .LCGEMM_L2x1_SAVE + ble CGEMM_L2x1_SAVE -.LCGEMM_L2x1_SUB2: +CGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x1_SUB2 + bgt CGEMM_L2x1_SUB2 -.LCGEMM_L2x1_SAVE: +CGEMM_L2x1_SAVE: SAVE2x1 -.LCGEMM_L2x1_END: +CGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LCGEMM_L2_END: +CGEMM_L2_END: - b .LCGEMM_L1_BEGIN + b CGEMM_L1_BEGIN -.L999_H2: +L999_H2: - b .L999 + b L999 -.LCGEMM_L1_BEGIN: +CGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LCGEMM_L1_END + ble CGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LCGEMM_L1x8_END + ble CGEMM_L1x8_END -.LCGEMM_L1x8_BEGIN: +CGEMM_L1x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x8_SUB0 + ble CGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x8_SUB4 + ble CGEMM_L1x8_SUB4 -.LCGEMM_L1x8_LOOP_START: +CGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -952,11 +954,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LCGEMM_L1x8_LOOP_END + ble CGEMM_L1x8_LOOP_END .align 5 -.LCGEMM_L1x8_LOOP: +CGEMM_L1x8_LOOP: KERNEL1x8_1 dcbt AO, PRE @@ -973,9 +975,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LCGEMM_L1x8_LOOP + bgt CGEMM_L1x8_LOOP -.LCGEMM_L1x8_LOOP_END: +CGEMM_L1x8_LOOP_END: KERNEL1x8_1 dcbt AO, PRE @@ -990,9 +992,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LCGEMM_L1x8_SUB1 + b CGEMM_L1x8_SUB1 -.LCGEMM_L1x8_SUB4: +CGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1004,53 +1006,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LCGEMM_L1x8_SUB1 + b CGEMM_L1x8_SUB1 -.LCGEMM_L1x8_SUB0: +CGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x8_SAVE - b .LCGEMM_L1x8_SUB2 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 -.LCGEMM_L1x8_SUB1: +CGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x8_SAVE + ble CGEMM_L1x8_SAVE -.LCGEMM_L1x8_SUB2: +CGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x8_SUB2 + bgt CGEMM_L1x8_SUB2 -.LCGEMM_L1x8_SAVE: +CGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LCGEMM_L1x8_BEGIN + bgt CGEMM_L1x8_BEGIN -.LCGEMM_L1x8_END: +CGEMM_L1x8_END: -.LCGEMM_L1x4_BEGIN: +CGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L1x1_END + ble CGEMM_L1x1_END andi. T1, M, 4 - ble .LCGEMM_L1x4_END + ble CGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x4_SUB0 + ble CGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x4_SUB4 + ble CGEMM_L1x4_SUB4 -.LCGEMM_L1x4_LOOP_START: +CGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1064,11 +1066,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LCGEMM_L1x4_LOOP_END + ble CGEMM_L1x4_LOOP_END .align 5 -.LCGEMM_L1x4_LOOP: +CGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1081,9 +1083,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LCGEMM_L1x4_LOOP + bgt CGEMM_L1x4_LOOP -.LCGEMM_L1x4_LOOP_END: +CGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1095,9 +1097,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LCGEMM_L1x4_SUB1 + b CGEMM_L1x4_SUB1 -.LCGEMM_L1x4_SUB4: +CGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1109,48 +1111,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LCGEMM_L1x4_SUB1 + b CGEMM_L1x4_SUB1 -.LCGEMM_L1x4_SUB0: +CGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. 
L, L, -1 - ble .LCGEMM_L1x4_SAVE - b .LCGEMM_L1x4_SUB2 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 -.LCGEMM_L1x4_SUB1: +CGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x4_SAVE + ble CGEMM_L1x4_SAVE -.LCGEMM_L1x4_SUB2: +CGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x4_SUB2 + bgt CGEMM_L1x4_SUB2 -.LCGEMM_L1x4_SAVE: +CGEMM_L1x4_SAVE: SAVE1x4 -.LCGEMM_L1x4_END: +CGEMM_L1x4_END: -.LCGEMM_L1x2_BEGIN: +CGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LCGEMM_L1x2_END + ble CGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x2_SUB0 + ble CGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x2_SUB4 + ble CGEMM_L1x2_SUB4 -.LCGEMM_L1x2_LOOP_START: +CGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1164,11 +1166,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LCGEMM_L1x2_LOOP_END + ble CGEMM_L1x2_LOOP_END .align 5 -.LCGEMM_L1x2_LOOP: +CGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1181,9 +1183,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LCGEMM_L1x2_LOOP + bgt CGEMM_L1x2_LOOP -.LCGEMM_L1x2_LOOP_END: +CGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1195,9 +1197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LCGEMM_L1x2_SUB1 + b CGEMM_L1x2_SUB1 -.LCGEMM_L1x2_SUB4: +CGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1209,48 +1211,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LCGEMM_L1x2_SUB1 + b CGEMM_L1x2_SUB1 -.LCGEMM_L1x2_SUB0: +CGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x2_SAVE - b .LCGEMM_L1x2_SUB2 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 -.LCGEMM_L1x2_SUB1: +CGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x2_SAVE + ble CGEMM_L1x2_SAVE -.LCGEMM_L1x2_SUB2: +CGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x2_SUB2 + bgt CGEMM_L1x2_SUB2 -.LCGEMM_L1x2_SAVE: +CGEMM_L1x2_SAVE: SAVE1x2 -.LCGEMM_L1x2_END: +CGEMM_L1x2_END: -.LCGEMM_L1x1_BEGIN: +CGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L1x1_END + ble CGEMM_L1x1_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x1_SUB0 + ble CGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x1_SUB4 + ble CGEMM_L1x1_SUB4 -.LCGEMM_L1x1_LOOP_START: +CGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1264,11 +1266,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LCGEMM_L1x1_LOOP_END + ble CGEMM_L1x1_LOOP_END .align 5 -.LCGEMM_L1x1_LOOP: +CGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1281,9 +1283,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LCGEMM_L1x1_LOOP + bgt CGEMM_L1x1_LOOP -.LCGEMM_L1x1_LOOP_END: +CGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1295,9 +1297,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LCGEMM_L1x1_SUB1 + b CGEMM_L1x1_SUB1 -.LCGEMM_L1x1_SUB4: +CGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1309,34 +1311,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LCGEMM_L1x1_SUB1 + b CGEMM_L1x1_SUB1 -.LCGEMM_L1x1_SUB0: +CGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x1_SAVE - b .LCGEMM_L1x1_SUB2 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 -.LCGEMM_L1x1_SUB1: +CGEMM_L1x1_SUB1: andi. 
L, K, 7 - ble .LCGEMM_L1x1_SAVE + ble CGEMM_L1x1_SAVE -.LCGEMM_L1x1_SUB2: +CGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x1_SUB2 + bgt CGEMM_L1x1_SUB2 -.LCGEMM_L1x1_SAVE: +CGEMM_L1x1_SAVE: SAVE1x1 -.LCGEMM_L1x1_END: +CGEMM_L1x1_END: -.LCGEMM_L1_END: +CGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 2085d3764..48a21252c 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -26,40 +26,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xssubsp - #define XSFADD_I1 xsaddsp - #define XSFADD_I2 xsaddsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xsaddsp - #define XSFADD_I1 xssubsp - #define XSFADD_I2 xsaddsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xsaddsp - #define XSFADD_I1 xsaddsp - #define XSFADD_I2 xssubsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp #else // CC || CR || RC || RR - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xssubsp - #define XSFADD_I1 xssubsp - #define XSFADD_I2 xssubsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp #endif @@ -172,24 +188,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
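// Editor's note (descriptive only, not part of the upstream patch): the KERNEL4x8_*
// bodies below follow one pattern — lxvw4x pulls in the packed A and B vectors,
// xxspltw broadcasts each 32-bit B element into its own VSX register, and the
// xvmaddasp chains multiply four packed A floats by one b scalar per instruction
// (the _1/_2 variants ping-pong between the vs8-vs15 and vs16-vs23 splat sets).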
.macro KERNEL4x8_1 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs24, o0, BO // load b0, b1 lxvw4x vs4, o0, AO // load a0, a1 - xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i - - lxvw4x vs25, o16, BO // load b2, b3 lxvw4x vs5, o16, AO // load a2, a3 - xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -211,47 +245,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi BO, BO, 32 xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi AO, AO, 64 xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 .endm .macro KERNEL4x8_2 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs24, o0, BO // load b0, b1 lxvw4x vs0, o0, AO // load a0, a1 - xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i - - lxvw4x vs25, o16, BO // load b2, b3 lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, 
a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -273,26 +316,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi AO, AO, 64 xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi BO, BO, 32 xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 .endm @@ -501,51 +533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 
// r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -566,51 +599,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - 
xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -631,51 +665,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - 
lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -696,51 +731,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -767,51 +803,52 @@ USE OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -832,51 +869,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -897,51 +935,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -962,51 +1001,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1033,51 +1073,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs48, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 - stxvw4x vs49, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1098,51 +1139,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs50, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs50, 0 + xxspltw vs9, vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 - stxvw4x vs51, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1163,51 +1205,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs52, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 - stxvw4x vs53, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1228,51 +1271,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs54, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 - stxvw4x vs55, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1299,51 +1343,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs56, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 - stxvw4x vs57, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1364,51 +1409,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs58, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 - stxvw4x vs59, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1429,51 +1475,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs60, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 - stxvw4x vs61, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1494,51 +1541,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs62, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 - stxvw4x vs63, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1886,51 +1934,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
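The shift amounts together with the lane comments (// r0_r ... // r1_i) imply that each of vs20-vs23 carries its scalar in a single word with the remaining words zero, so sliding each one against the zeroed vs24 by 3, 2, 1 and 0 words drops the values into their target lanes, and the xvaddsp chain rebuilds the packed [r0_r, r0_i, r1_r, r1_i] vector entirely in registers. A plain-C model of that merge, assuming the word numbering of the xxsldwi instruction description (the vreg type and helper names are invented for illustration):

    #include <string.h>

    typedef struct { float w[4]; } vreg;           /* one VSX register, word view   */

    static vreg xxsldwi(vreg a, vreg b, int shw)   /* leftmost 4 words of (a||b)<<shw */
    {
        float cat[8];
        memcpy(cat, a.w, sizeof a.w);
        memcpy(cat + 4, b.w, sizeof b.w);
        vreg r;
        for (int i = 0; i < 4; i++) r.w[i] = cat[i + shw];
        return r;
    }

    static vreg merge(float r0_r, float r0_i, float r1_r, float r1_i)
    {
        vreg zero = {{0, 0, 0, 0}};                /* plays the role of vs24        */
        vreg v20 = {{0, 0, 0, r0_r}}, v21 = {{0, 0, 0, r0_i}};
        vreg v22 = {{0, 0, 0, r1_r}}, v23 = {{0, 0, 0, r1_i}};
        v20 = xxsldwi(v20, zero, 3);               /* -> [r0_r, 0, 0, 0]            */
        v21 = xxsldwi(v21, zero, 2);               /* -> [0, r0_i, 0, 0]            */
        v22 = xxsldwi(v22, zero, 1);               /* -> [0, 0, r1_r, 0]            */
        v23 = xxsldwi(v23, zero, 0);               /* -> [0, 0, 0, r1_i]            */
        vreg out;                                  /* xvaddsp chain                 */
        for (int i = 0; i < 4; i++)
            out.w[i] = (v20.w[i] + v21.w[i]) + (v22.w[i] + v23.w[i]);
        return out;                                /* [r0_r, r0_i, r1_r, r1_i]      */
    }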
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1951,51 +2000,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2022,51 +2072,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2087,51 +2138,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2158,51 +2210,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2223,51 +2276,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2294,51 +2348,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2359,51 +2414,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2691,51 +2747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2762,51 +2819,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2833,51 +2891,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2904,51 +2963,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3028,25 +3088,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r - xsmulsp vs40, vs0, vs12 // a0_r*b2_r - xsmulsp vs41, vs1, vs13 // a0_i*b2_i - xsmulsp vs42, vs0, vs13 // a0_r*b2_i - xsmulsp vs43, vs1, vs12 // a0_i*b2_r + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r - xsmulsp vs44, vs0, vs14 // a0_r*b3_r - xsmulsp vs45, vs1, vs15 // a0_i*b3_i - xsmulsp vs46, vs0, vs15 // a0_r*b3_i - xsmulsp vs47, vs1, vs14 // a0_i*b3_r + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3082,25 +3142,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r - xsmaddasp vs40, vs0, vs12 // a0_r*b2_r - xsmaddasp vs41, vs1, vs13 // a0_i*b2_i - xsmaddasp vs42, vs0, vs13 // a0_r*b2_i - xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r - xsmaddasp vs44, vs0, vs14 // a0_r*b3_r - xsmaddasp vs45, vs1, vs15 // a0_i*b3_i - xsmaddasp vs46, vs0, vs15 // a0_r*b3_i - xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3136,25 +3196,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r - xsmaddasp vs40, vs4, vs20 // a4_r*b2_r - xsmaddasp vs41, vs5, vs21 // a4_i*b2_i - xsmaddasp vs42, vs4, vs21 // a4_r*b2_i - xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r - xsmaddasp vs44, vs4, vs22 // a4_r*b3_r - xsmaddasp vs45, vs5, vs23 // a4_i*b3_i - xsmaddasp vs46, vs4, vs23 // a4_r*b3_i - xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm @@ -3162,25 +3222,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r - xsmaddasp vs40, vs4, vs20 // a4_r*b2_r - xsmaddasp vs41, vs5, vs21 // a4_i*b2_i - xsmaddasp vs42, vs4, vs21 // a4_r*b2_i - xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r - xsmaddasp vs44, vs4, vs22 // a4_r*b3_r - xsmaddasp vs45, vs5, vs23 // a4_i*b3_i - xsmaddasp vs46, vs4, vs23 // a4_r*b3_i - xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm @@ -3216,25 +3276,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r - xsmulsp vs40, vs0, vs12 // a0_r*b2_r - xsmulsp vs41, vs1, vs13 // a0_i*b2_i - xsmulsp vs42, vs0, vs13 // a0_r*b2_i - xsmulsp vs43, vs1, vs12 // a0_i*b2_r + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r - xsmulsp vs44, vs0, vs14 // a0_r*b3_r - xsmulsp vs45, vs1, vs15 // a0_i*b3_i - xsmulsp vs46, vs0, vs15 // a0_r*b3_i - xsmulsp vs47, vs1, vs14 // a0_i*b3_r + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3270,25 +3330,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r - xsmaddasp vs40, vs0, vs12 // a0_r*b2_r - xsmaddasp vs41, vs1, vs13 // a0_i*b2_i - xsmaddasp vs42, vs0, vs13 // a0_r*b2_i - xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r - xsmaddasp vs44, vs0, vs14 // a0_r*b3_r - xsmaddasp vs45, vs1, vs15 // a0_i*b3_i - xsmaddasp vs46, vs0, vs15 // a0_r*b3_i - xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3320,16 +3380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3362,16 +3422,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3404,16 +3464,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3446,16 +3506,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3773,51 +3833,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - 
stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3838,51 +3899,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp 
vs0, vs0, vs1 @@ -3903,51 +3965,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3968,51 +4031,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4039,51 +4103,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4104,51 +4169,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4169,51 +4235,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4234,51 +4301,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4524,51 +4592,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4589,51 +4658,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4660,51 +4730,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4725,51 +4796,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4979,51 +5051,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5050,51 +5123,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5154,15 +5228,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5188,15 +5262,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5222,15 +5296,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm @@ -5238,15 +5312,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm @@ -5272,15 +5346,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5306,15 +5380,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5346,16 +5420,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -5388,16 +5462,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -5673,51 +5747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5738,51 +5813,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5803,51 +5879,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5868,51 +5945,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6140,51 +6218,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6205,51 +6284,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6453,51 +6533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6547,10 +6628,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 8 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6571,10 +6652,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6595,10 +6676,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm @@ -6606,10 +6687,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm @@ -6630,10 +6711,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6654,10 +6735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6689,16 +6770,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index b15485751..b202114dd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -129,18 +129,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 -#define alpha_vr vs28 -#define alpha_vi vs29 +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 #define o12 r12 #define KKK r13 #define K1 r14 #define L r15 #define o16 r16 -#define TBUFFER r17 +#define NOTUSED r17 #define T2 r19 #define KK r20 #define o8 r21 @@ -278,21 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_8x4_power8.S" cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 - addi TBUFFER, SP, 360 #ifdef __64BIT__ @@ -301,14 +300,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi T1, SP, 224 #endif - lxsspx alpha_r, 0, T1 - lxsspx alpha_i, o8, T1 + lxsspx alpha_dr, 0, T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 .align 5 #include "ctrmm_logic_8x4_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S index f9656e90b..3e50646b0 100644 --- a/kernel/power/ctrmm_logic_8x4_power8.S +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 2 - ble .LCTRMM_L4_END + ble CTRMM_L4_END -.LCTRMM_L4_BEGIN: +CTRMM_L4_BEGIN: mr CO, C mr AO, A @@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L4x8_END + ble CTRMM_L4x8_END -.LCTRMM_L4x8_BEGIN: +CTRMM_L4x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x8_SUB0 + ble CTRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x8_SUB4 + ble CTRMM_L4x8_SUB4 -.LCTRMM_L4x8_LOOP_START: +CTRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -96,11 +95,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LCTRMM_L4x8_LOOP_END + ble CTRMM_L4x8_LOOP_END .align 5 -.LCTRMM_L4x8_LOOP: +CTRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -113,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LCTRMM_L4x8_LOOP + bgt CTRMM_L4x8_LOOP -.LCTRMM_L4x8_LOOP_END: +CTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -127,9 +126,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LCTRMM_L4x8_SUB1 + b CTRMM_L4x8_SUB1 -.LCTRMM_L4x8_SUB4: +CTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -141,31 +140,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LCTRMM_L4x8_SUB1 + b CTRMM_L4x8_SUB1 -.LCTRMM_L4x8_SUB0: +CTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x8_SAVE - b .LCTRMM_L4x8_SUB2 + ble CTRMM_L4x8_SAVE + b CTRMM_L4x8_SUB2 -.LCTRMM_L4x8_SUB1: +CTRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x8_SAVE + ble CTRMM_L4x8_SAVE -.LCTRMM_L4x8_SUB2: +CTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x8_SUB2 + bgt CTRMM_L4x8_SUB2 -.LCTRMM_L4x8_SAVE: +CTRMM_L4x8_SAVE: SAVE4x8 @@ -183,16 +182,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L4x8_BEGIN + bgt CTRMM_L4x8_BEGIN -.LCTRMM_L4x8_END: +CTRMM_L4x8_END: -.LCTRMM_L4x4_BEGIN: +CTRMM_L4x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L4x1_END + ble CTRMM_L4x1_END andi. T1, M, 4 - ble .LCTRMM_L4x4_END + ble CTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -218,11 +217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x4_SUB0 + ble CTRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x4_SUB4 + ble CTRMM_L4x4_SUB4 -.LCTRMM_L4x4_LOOP_START: +CTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -236,11 +235,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. 
L, L, -2 - ble .LCTRMM_L4x4_LOOP_END + ble CTRMM_L4x4_LOOP_END .align 5 -.LCTRMM_L4x4_LOOP: +CTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -253,9 +252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LCTRMM_L4x4_LOOP + bgt CTRMM_L4x4_LOOP -.LCTRMM_L4x4_LOOP_END: +CTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -267,9 +266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LCTRMM_L4x4_SUB1 + b CTRMM_L4x4_SUB1 -.LCTRMM_L4x4_SUB4: +CTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -281,31 +280,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LCTRMM_L4x4_SUB1 + b CTRMM_L4x4_SUB1 -.LCTRMM_L4x4_SUB0: +CTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x4_SAVE - b .LCTRMM_L4x4_SUB2 + ble CTRMM_L4x4_SAVE + b CTRMM_L4x4_SUB2 -.LCTRMM_L4x4_SUB1: +CTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x4_SAVE + ble CTRMM_L4x4_SAVE -.LCTRMM_L4x4_SUB2: +CTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x4_SUB2 + bgt CTRMM_L4x4_SUB2 -.LCTRMM_L4x4_SAVE: +CTRMM_L4x4_SAVE: SAVE4x4 @@ -322,12 +321,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L4x4_END: +CTRMM_L4x4_END: -.LCTRMM_L4x2_BEGIN: +CTRMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L4x2_END + ble CTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -353,11 +352,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x2_SUB0 + ble CTRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x2_SUB4 + ble CTRMM_L4x2_SUB4 -.LCTRMM_L4x2_LOOP_START: +CTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -371,11 +370,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LCTRMM_L4x2_LOOP_END + ble CTRMM_L4x2_LOOP_END .align 5 -.LCTRMM_L4x2_LOOP: +CTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -388,9 +387,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LCTRMM_L4x2_LOOP + bgt CTRMM_L4x2_LOOP -.LCTRMM_L4x2_LOOP_END: +CTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -402,9 +401,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LCTRMM_L4x2_SUB1 + b CTRMM_L4x2_SUB1 -.LCTRMM_L4x2_SUB4: +CTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -416,31 +415,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LCTRMM_L4x2_SUB1 + b CTRMM_L4x2_SUB1 -.LCTRMM_L4x2_SUB0: +CTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x2_SAVE - b .LCTRMM_L4x2_SUB2 + ble CTRMM_L4x2_SAVE + b CTRMM_L4x2_SUB2 -.LCTRMM_L4x2_SUB1: +CTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x2_SAVE + ble CTRMM_L4x2_SAVE -.LCTRMM_L4x2_SUB2: +CTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x2_SUB2 + bgt CTRMM_L4x2_SUB2 -.LCTRMM_L4x2_SAVE: +CTRMM_L4x2_SAVE: SAVE4x2 @@ -457,12 +456,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L4x2_END: +CTRMM_L4x2_END: -.LCTRMM_L4x1_BEGIN: +CTRMM_L4x1_BEGIN: andi. 
T1, M, 1 - ble .LCTRMM_L4x1_END + ble CTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -488,11 +487,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x1_SUB0 + ble CTRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x1_SUB4 + ble CTRMM_L4x1_SUB4 -.LCTRMM_L4x1_LOOP_START: +CTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -506,11 +505,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LCTRMM_L4x1_LOOP_END + ble CTRMM_L4x1_LOOP_END .align 5 -.LCTRMM_L4x1_LOOP: +CTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -523,9 +522,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LCTRMM_L4x1_LOOP + bgt CTRMM_L4x1_LOOP -.LCTRMM_L4x1_LOOP_END: +CTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -537,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LCTRMM_L4x1_SUB1 + b CTRMM_L4x1_SUB1 -.LCTRMM_L4x1_SUB4: +CTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -551,31 +550,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LCTRMM_L4x1_SUB1 + b CTRMM_L4x1_SUB1 -.LCTRMM_L4x1_SUB0: +CTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x1_SAVE - b .LCTRMM_L4x1_SUB2 + ble CTRMM_L4x1_SAVE + b CTRMM_L4x1_SUB2 -.LCTRMM_L4x1_SUB1: +CTRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x1_SAVE + ble CTRMM_L4x1_SAVE -.LCTRMM_L4x1_SUB2: +CTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x1_SUB2 + bgt CTRMM_L4x1_SUB2 -.LCTRMM_L4x1_SAVE: +CTRMM_L4x1_SAVE: SAVE4x1 @@ -592,7 +591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L4x1_END: +CTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 @@ -603,23 +602,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. J, J, -1 - bgt .LCTRMM_L4_BEGIN + bgt CTRMM_L4_BEGIN andi. T2, N, 3 - ble .L999_H2 + ble L999_H2 -.LCTRMM_L4_END: +CTRMM_L4_END: - b .LCTRMM_L2_BEGIN + b CTRMM_L2_BEGIN -.L999_H1: +L999_H1: - b .L999_H2 + b L999_H2 -.LCTRMM_L2_BEGIN: +CTRMM_L2_BEGIN: andi. T1, N, 2 - ble .LCTRMM_L2_END + ble CTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -630,9 +629,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L2x8_END + ble CTRMM_L2x8_END -.LCTRMM_L2x8_BEGIN: +CTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -659,11 +658,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x8_SUB0 + ble CTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x8_SUB4 + ble CTRMM_L2x8_SUB4 -.LCTRMM_L2x8_LOOP_START: +CTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -677,11 +676,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LCTRMM_L2x8_LOOP_END + ble CTRMM_L2x8_LOOP_END .align 5 -.LCTRMM_L2x8_LOOP: +CTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -694,9 +693,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. 
L, L, -1 - bgt .LCTRMM_L2x8_LOOP + bgt CTRMM_L2x8_LOOP -.LCTRMM_L2x8_LOOP_END: +CTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -708,9 +707,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LCTRMM_L2x8_SUB1 + b CTRMM_L2x8_SUB1 -.LCTRMM_L2x8_SUB4: +CTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -722,31 +721,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LCTRMM_L2x8_SUB1 + b CTRMM_L2x8_SUB1 -.LCTRMM_L2x8_SUB0: +CTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x8_SAVE - b .LCTRMM_L2x8_SUB2 + ble CTRMM_L2x8_SAVE + b CTRMM_L2x8_SUB2 -.LCTRMM_L2x8_SUB1: +CTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x8_SAVE + ble CTRMM_L2x8_SAVE -.LCTRMM_L2x8_SUB2: +CTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x8_SUB2 + bgt CTRMM_L2x8_SUB2 -.LCTRMM_L2x8_SAVE: +CTRMM_L2x8_SAVE: SAVE2x8 @@ -764,16 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L2x8_BEGIN + bgt CTRMM_L2x8_BEGIN -.LCTRMM_L2x8_END: +CTRMM_L2x8_END: -.LCTRMM_L2x4_BEGIN: +CTRMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L2x1_END + ble CTRMM_L2x1_END andi. T1, M, 4 - ble .LCTRMM_L2x4_END + ble CTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -799,11 +798,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x4_SUB0 + ble CTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x4_SUB4 + ble CTRMM_L2x4_SUB4 -.LCTRMM_L2x4_LOOP_START: +CTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -817,11 +816,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LCTRMM_L2x4_LOOP_END + ble CTRMM_L2x4_LOOP_END .align 5 -.LCTRMM_L2x4_LOOP: +CTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -834,9 +833,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LCTRMM_L2x4_LOOP + bgt CTRMM_L2x4_LOOP -.LCTRMM_L2x4_LOOP_END: +CTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -848,9 +847,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LCTRMM_L2x4_SUB1 + b CTRMM_L2x4_SUB1 -.LCTRMM_L2x4_SUB4: +CTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -862,31 +861,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LCTRMM_L2x4_SUB1 + b CTRMM_L2x4_SUB1 -.LCTRMM_L2x4_SUB0: +CTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x4_SAVE - b .LCTRMM_L2x4_SUB2 + ble CTRMM_L2x4_SAVE + b CTRMM_L2x4_SUB2 -.LCTRMM_L2x4_SUB1: +CTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x4_SAVE + ble CTRMM_L2x4_SAVE -.LCTRMM_L2x4_SUB2: +CTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x4_SUB2 + bgt CTRMM_L2x4_SUB2 -.LCTRMM_L2x4_SAVE: +CTRMM_L2x4_SAVE: SAVE2x4 @@ -903,12 +902,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x4_END: +CTRMM_L2x4_END: -.LCTRMM_L2x2_BEGIN: +CTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L2x2_END + ble CTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -934,11 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x2_SUB0 + ble CTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x2_SUB4 + ble CTRMM_L2x2_SUB4 -.LCTRMM_L2x2_LOOP_START: +CTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -952,11 +951,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LCTRMM_L2x2_LOOP_END + ble CTRMM_L2x2_LOOP_END .align 5 -.LCTRMM_L2x2_LOOP: +CTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -969,9 +968,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LCTRMM_L2x2_LOOP + bgt CTRMM_L2x2_LOOP -.LCTRMM_L2x2_LOOP_END: +CTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -983,9 +982,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LCTRMM_L2x2_SUB1 + b CTRMM_L2x2_SUB1 -.LCTRMM_L2x2_SUB4: +CTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -997,31 +996,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LCTRMM_L2x2_SUB1 + b CTRMM_L2x2_SUB1 -.LCTRMM_L2x2_SUB0: +CTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x2_SAVE - b .LCTRMM_L2x2_SUB2 + ble CTRMM_L2x2_SAVE + b CTRMM_L2x2_SUB2 -.LCTRMM_L2x2_SUB1: +CTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x2_SAVE + ble CTRMM_L2x2_SAVE -.LCTRMM_L2x2_SUB2: +CTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x2_SUB2 + bgt CTRMM_L2x2_SUB2 -.LCTRMM_L2x2_SAVE: +CTRMM_L2x2_SAVE: SAVE2x2 @@ -1038,12 +1037,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x2_END: +CTRMM_L2x2_END: -.LCTRMM_L2x1_BEGIN: +CTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LCTRMM_L2x1_END + ble CTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1069,11 +1068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x1_SUB0 + ble CTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x1_SUB4 + ble CTRMM_L2x1_SUB4 -.LCTRMM_L2x1_LOOP_START: +CTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1087,11 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LCTRMM_L2x1_LOOP_END + ble CTRMM_L2x1_LOOP_END .align 5 -.LCTRMM_L2x1_LOOP: +CTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1104,9 +1103,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LCTRMM_L2x1_LOOP + bgt CTRMM_L2x1_LOOP -.LCTRMM_L2x1_LOOP_END: +CTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1118,9 +1117,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LCTRMM_L2x1_SUB1 + b CTRMM_L2x1_SUB1 -.LCTRMM_L2x1_SUB4: +CTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1132,31 +1131,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LCTRMM_L2x1_SUB1 + b CTRMM_L2x1_SUB1 -.LCTRMM_L2x1_SUB0: +CTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x1_SAVE - b .LCTRMM_L2x1_SUB2 + ble CTRMM_L2x1_SAVE + b CTRMM_L2x1_SUB2 -.LCTRMM_L2x1_SUB1: +CTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x1_SAVE + ble CTRMM_L2x1_SAVE -.LCTRMM_L2x1_SUB2: +CTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 - bgt .LCTRMM_L2x1_SUB2 + bgt CTRMM_L2x1_SUB2 -.LCTRMM_L2x1_SAVE: +CTRMM_L2x1_SAVE: SAVE2x1 @@ -1173,7 +1172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x1_END: +CTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 @@ -1183,18 +1182,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2_END: +CTRMM_L2_END: - b .LCTRMM_L1_BEGIN + b CTRMM_L1_BEGIN -.L999_H2: +L999_H2: - b .L999 + b L999 -.LCTRMM_L1_BEGIN: +CTRMM_L1_BEGIN: andi. T1, N, 1 - ble .LCTRMM_L1_END + ble CTRMM_L1_END mr CO, C mr AO, A @@ -1203,9 +1202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L1x8_END + ble CTRMM_L1x8_END -.LCTRMM_L1x8_BEGIN: +CTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1232,11 +1231,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x8_SUB0 + ble CTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x8_SUB4 + ble CTRMM_L1x8_SUB4 -.LCTRMM_L1x8_LOOP_START: +CTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1250,11 +1249,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LCTRMM_L1x8_LOOP_END + ble CTRMM_L1x8_LOOP_END .align 5 -.LCTRMM_L1x8_LOOP: +CTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1267,9 +1266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LCTRMM_L1x8_LOOP + bgt CTRMM_L1x8_LOOP -.LCTRMM_L1x8_LOOP_END: +CTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1281,9 +1280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LCTRMM_L1x8_SUB1 + b CTRMM_L1x8_SUB1 -.LCTRMM_L1x8_SUB4: +CTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1295,31 +1294,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LCTRMM_L1x8_SUB1 + b CTRMM_L1x8_SUB1 -.LCTRMM_L1x8_SUB0: +CTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x8_SAVE - b .LCTRMM_L1x8_SUB2 + ble CTRMM_L1x8_SAVE + b CTRMM_L1x8_SUB2 -.LCTRMM_L1x8_SUB1: +CTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x8_SAVE + ble CTRMM_L1x8_SAVE -.LCTRMM_L1x8_SUB2: +CTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x8_SUB2 + bgt CTRMM_L1x8_SUB2 -.LCTRMM_L1x8_SAVE: +CTRMM_L1x8_SAVE: SAVE1x8 @@ -1337,16 +1336,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L1x8_BEGIN + bgt CTRMM_L1x8_BEGIN -.LCTRMM_L1x8_END: +CTRMM_L1x8_END: -.LCTRMM_L1x4_BEGIN: +CTRMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L1x1_END + ble CTRMM_L1x1_END andi. T1, M, 4 - ble .LCTRMM_L1x4_END + ble CTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1372,11 +1371,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x4_SUB0 + ble CTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x4_SUB4 + ble CTRMM_L1x4_SUB4 -.LCTRMM_L1x4_LOOP_START: +CTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1390,11 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. 
L, L, -2 - ble .LCTRMM_L1x4_LOOP_END + ble CTRMM_L1x4_LOOP_END .align 5 -.LCTRMM_L1x4_LOOP: +CTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1407,9 +1406,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LCTRMM_L1x4_LOOP + bgt CTRMM_L1x4_LOOP -.LCTRMM_L1x4_LOOP_END: +CTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1421,9 +1420,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LCTRMM_L1x4_SUB1 + b CTRMM_L1x4_SUB1 -.LCTRMM_L1x4_SUB4: +CTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1435,31 +1434,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LCTRMM_L1x4_SUB1 + b CTRMM_L1x4_SUB1 -.LCTRMM_L1x4_SUB0: +CTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x4_SAVE - b .LCTRMM_L1x4_SUB2 + ble CTRMM_L1x4_SAVE + b CTRMM_L1x4_SUB2 -.LCTRMM_L1x4_SUB1: +CTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x4_SAVE + ble CTRMM_L1x4_SAVE -.LCTRMM_L1x4_SUB2: +CTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x4_SUB2 + bgt CTRMM_L1x4_SUB2 -.LCTRMM_L1x4_SAVE: +CTRMM_L1x4_SAVE: SAVE1x4 @@ -1476,12 +1475,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x4_END: +CTRMM_L1x4_END: -.LCTRMM_L1x2_BEGIN: +CTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L1x2_END + ble CTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1507,11 +1506,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x2_SUB0 + ble CTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x2_SUB4 + ble CTRMM_L1x2_SUB4 -.LCTRMM_L1x2_LOOP_START: +CTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1525,11 +1524,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LCTRMM_L1x2_LOOP_END + ble CTRMM_L1x2_LOOP_END .align 5 -.LCTRMM_L1x2_LOOP: +CTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1542,9 +1541,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LCTRMM_L1x2_LOOP + bgt CTRMM_L1x2_LOOP -.LCTRMM_L1x2_LOOP_END: +CTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1556,9 +1555,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LCTRMM_L1x2_SUB1 + b CTRMM_L1x2_SUB1 -.LCTRMM_L1x2_SUB4: +CTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1570,31 +1569,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LCTRMM_L1x2_SUB1 + b CTRMM_L1x2_SUB1 -.LCTRMM_L1x2_SUB0: +CTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x2_SAVE - b .LCTRMM_L1x2_SUB2 + ble CTRMM_L1x2_SAVE + b CTRMM_L1x2_SUB2 -.LCTRMM_L1x2_SUB1: +CTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x2_SAVE + ble CTRMM_L1x2_SAVE -.LCTRMM_L1x2_SUB2: +CTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x2_SUB2 + bgt CTRMM_L1x2_SUB2 -.LCTRMM_L1x2_SAVE: +CTRMM_L1x2_SAVE: SAVE1x2 @@ -1611,12 +1610,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x2_END: +CTRMM_L1x2_END: -.LCTRMM_L1x1_BEGIN: +CTRMM_L1x1_BEGIN: andi. 
T1, M, 1 - ble .LCTRMM_L1x1_END + ble CTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1642,11 +1641,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x1_SUB0 + ble CTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x1_SUB4 + ble CTRMM_L1x1_SUB4 -.LCTRMM_L1x1_LOOP_START: +CTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1660,11 +1659,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LCTRMM_L1x1_LOOP_END + ble CTRMM_L1x1_LOOP_END .align 5 -.LCTRMM_L1x1_LOOP: +CTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1677,9 +1676,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LCTRMM_L1x1_LOOP + bgt CTRMM_L1x1_LOOP -.LCTRMM_L1x1_LOOP_END: +CTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1691,9 +1690,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LCTRMM_L1x1_SUB1 + b CTRMM_L1x1_SUB1 -.LCTRMM_L1x1_SUB4: +CTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1705,31 +1704,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LCTRMM_L1x1_SUB1 + b CTRMM_L1x1_SUB1 -.LCTRMM_L1x1_SUB0: +CTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x1_SAVE - b .LCTRMM_L1x1_SUB2 + ble CTRMM_L1x1_SAVE + b CTRMM_L1x1_SUB2 -.LCTRMM_L1x1_SUB1: +CTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x1_SAVE + ble CTRMM_L1x1_SAVE -.LCTRMM_L1x1_SUB2: +CTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x1_SUB2 + bgt CTRMM_L1x1_SUB2 -.LCTRMM_L1x1_SAVE: +CTRMM_L1x1_SAVE: SAVE1x1 @@ -1746,11 +1745,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x1_END: +CTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -.LCTRMM_L1_END: +CTRMM_L1_END:
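
Note on the SAVE-macro rewrite in cgemm_macros_8x4_power8.S: the old code spilled each accumulator pair to TBUFFER with stxvw4x, read the four words back one at a time with lxsspx, did the alpha scaling with scalar xs*sp arithmetic, and then rebuilt a vector through another store/reload. The new code keeps everything in VSX registers: xxspltw broadcasts each 32-bit lane, xvmulsp/xvsubsp/xvaddsp do the scaling on all four lanes at once, and xxsldwi plus xvaddsp pack the r0_r/r0_i/r1_r/r1_i words back into a single vector before the final xvaddsp into vs0. The arithmetic itself is unchanged: per complex element it is the usual alpha update. A minimal C sketch of that per-element computation (non-conjugated case) follows; the function name scale_add_c and the float[2] real/imag layout are assumptions made for the sketch, not part of the patch.

    /* Mirrors vs20 = r_r*alpha_r - r_i*alpha_i and                         */
    /*         vs21 = r_r*alpha_i + r_i*alpha_r in the SAVE macros.         */
    #include <stdio.h>

    static void scale_add_c(float c[2], const float r[2],
                            float alpha_r, float alpha_i)
    {
        float t_r = r[0] * alpha_r - r[1] * alpha_i;   /* real part of alpha*r */
        float t_i = r[0] * alpha_i + r[1] * alpha_r;   /* imag part of alpha*r */
        c[0] += t_r;
        c[1] += t_i;
    }

    int main(void)
    {
        float c[2] = { 1.0f, 2.0f };
        float r[2] = { 0.5f, -0.25f };
        scale_add_c(c, r, 3.0f, 1.0f);        /* alpha = 3 + 1i             */
        printf("%f %f\n", c[0], c[1]);        /* prints 2.750000 1.750000   */
        return 0;
    }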
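
Note on the scalar tail kernels (KERNEL2x1_*, KERNEL1x1_* and their SAVE code): the patch switches the multiply/accumulate and alpha-scaling instructions from the xs*sp forms to the xs*dp forms, so the partial products are accumulated and scaled in double precision (alpha_dr/alpha_di) and only the final stxsspx rounds the result back to single. In C terms the effect is roughly the analogue below; the function name cdot_real_dp and the interleaved real/imag array layout are assumptions for this sketch only, and only the non-conjugated real part is shown.

    #include <stdio.h>

    /* Real part of a length-k complex single-precision dot product,        */
    /* accumulated in double and rounded to single only at the end.         */
    static float cdot_real_dp(const float *a, const float *b, int k)
    {
        double acc_rr = 0.0, acc_ii = 0.0;    /* sums of a_r*b_r and a_i*b_i */
        for (int i = 0; i < k; i++) {
            acc_rr += (double)a[2*i]     * (double)b[2*i];
            acc_ii += (double)a[2*i + 1] * (double)b[2*i + 1];
        }
        return (float)(acc_rr - acc_ii);
    }

    int main(void)
    {
        float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };   /* (1+2i), (3+4i)        */
        float b[4] = { 5.0f, 6.0f, 7.0f, 8.0f };   /* (5+6i), (7+8i)        */
        printf("%f\n", cdot_real_dp(a, b, 2));     /* (1*5-2*6)+(3*7-4*8) = -18 */
        return 0;
    }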