diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 440eaab1b..126313c9a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -6,7 +6,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index a41bcec77..813f270b8 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T8 r16 #define T5 r17 #define T2 r19 -#define T9 r20 +#define TEMP_REG r20 #define T6 r21 #define I r22 #define J r23 diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 01685fe79..f902484a3 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -26,972 +26,1866 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define MY_ALIGN .align 3 b ZGEMM_L2 - -/* MINI SUBROUTINES */ +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ - -/* 2x8 MAIN 128x+1 LOOP */ -ZGEMM_L2x8_LMAIN_SUB: - mtctr L - LOAD2x8 0 - MY_ALIGN +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_L 128,32,31,0 - KERNEL2x8_L 128,32,32,0 - KERNEL2x8_L 128,32,33,0 - KERNEL2x8_L 128,32,34,0 - KERNEL2x8_L 128,32,35,0 - KERNEL2x8_L 128,32,36,0 - KERNEL2x8_L 128,32,37,0 - KERNEL2x8_L 128,32,38,0 - KERNEL2x8_L 128,32,39,0 - KERNEL2x8_L 128,32,40,0 - KERNEL2x8_L 128,32,41,0 - KERNEL2x8_L 128,32,42,0 - KERNEL2x8_L 128,32,43,0 - KERNEL2x8_L 128,32,44,0 - KERNEL2x8_L 128,32,45,0 - KERNEL2x8_L 128,32,46,0 - KERNEL2x8_L 128,32,47,0 - KERNEL2x8_L 128,32,48,0 - KERNEL2x8_L 128,32,49,0 - KERNEL2x8_L 128,32,50,0 - KERNEL2x8_L 128,32,51,0 - KERNEL2x8_L 128,32,52,0 - KERNEL2x8_L 128,32,53,0 - KERNEL2x8_L 128,32,54,0 - KERNEL2x8_L 128,32,55,0 - KERNEL2x8_L 128,32,56,0 - KERNEL2x8_L 128,32,57,0 - KERNEL2x8_L 128,32,58,0 - KERNEL2x8_L 128,32,59,0 - KERNEL2x8_L 128,32,60,0 - KERNEL2x8_L 128,32,61,0 - KERNEL2x8_L 128,32,62,0 - KERNEL2x8_L 128,32,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + 
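/* note (annotation, derived from ZGEMM_L2x8_SUB0 below): when the K count, or its TRMM-adjusted value in T6, is exactly 128 or 129, the tail-handling code sets CTR to 1 and branches directly to ZGEMM_L2x8_K128, so the remainder of this unrolled body (each KERNEL2x8_L2 covers two k-iterations) plus END2x8_2 runs exactly once */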
KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128,32 - blr - +/*----------------------------------------*/ + END2x8_2 + blr MY_ALIGN + + ZGEMM_2x8_L64_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_E 128,32,31,1 - blr - - +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + 
KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr MY_ALIGN + + ZGEMM_2x8_L32_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,1 - blr +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr MY_ALIGN + ZGEMM_2x8_L16_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,1 - blr - MY_ALIGN +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + ZGEMM_2x4_LMAIN_SUB: - mtctr L - LOAD2x4 0 - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,32,0,0 - KERNEL2x4_L 64,32,1,0 - KERNEL2x4_L 64,32,2,0 - KERNEL2x4_L 64,32,3,0 - KERNEL2x4_L 64,32,4,0 - KERNEL2x4_L 64,32,5,0 - KERNEL2x4_L 64,32,6,0 - KERNEL2x4_L 64,32,7,0 - KERNEL2x4_L 64,32,8,0 - KERNEL2x4_L 64,32,9,0 - KERNEL2x4_L 64,32,10,0 - KERNEL2x4_L 64,32,11,0 - KERNEL2x4_L 64,32,12,0 - KERNEL2x4_L 64,32,13,0 - KERNEL2x4_L 64,32,14,0 - KERNEL2x4_L 64,32,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64,32 - 
blr - +/*----------------------------------------*/ + END2x4_2 + blr MY_ALIGN + + ZGEMM_2x4_L16_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_L 64,32, 3,0 - KERNEL2x4_L 64,32, 4,0 - KERNEL2x4_L 64,32, 5,0 - KERNEL2x4_L 64,32, 6,0 - KERNEL2x4_E 64,32, 7,1 +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 blr MY_ALIGN -ZGEMM_2x4_L8_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_E 64,32, 3,1 +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 blr -/* MAIN LOOP BEGINS */ - MY_ALIGN + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + ZGEMM_L2: - srawi. 
J, N, 1 - ble ZGEMM_L2_END +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + ZGEMM_L2_BEGIN: - mr CO, C - slwi T1, LDC , 1 +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 add T2,C,LDC - mr AO, A - add C, C, T1 - srawi. I, M, 3 - ble ZGEMM_L2x8_END + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END dcbt CO,r0 /*just prefetch*/ dcbt T2,r0 -ZGEMM_L2x8_BEGIN: - mr T1, K - mr BO, B - dcbt B, r0 - dcbt AO, r0 - /* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -1 - /* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. L, T1, 7 /**(K-1) % 128x */ - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB0: - andi. L, K, 255 +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else cmpwi K,128 - bne ZGEMM_L2x8_SUB2 - MY_ALIGN -ZGEMM_L2x8_SUB2_128: - bl ZGEMM_2x8_L64_SUB - bl ZGEMM_2x8_L64_SUB - b ZGEMM_L2x8_SAVE +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE MY_ALIGN + + ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x8_SUB2_4 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_L 128,32, 1,0 - KERNEL2x8_L 128,32, 2,0 - KERNEL2x8_E 128,32, 3,1 - MY_ALIGN + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ andi. 
T1,L, 4 ble ZGEMM_L2x8_SUB2_2 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_E 128,32, 1,1 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 - LOAD2x8 0 - KERNEL2x8_E 128,32, 0,1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + ZGEMM_L2x8_SAVE: - addic. I, I, -1 - SAVE2x8 +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN ZGEMM_L2x8_END: +/*----------------------------------------*/ + ZGEMM_L2x4_BEGIN: - - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. T1, M, 4 - ble ZGEMM_L2x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - ZERO2x4 - srawi. L, T1, 5 /**(K-1) % 32x */ - - ble ZGEMM_L2x4_SUB0 +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + ZGEMM_L2x4_SUB0: - andi. L, K, 63 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else cmpwi K,32 - bne ZGEMM_L2x4_SUB2 - MY_ALIGN -ZGEMM_L2x4_SUB2_32: - bl ZGEMM_2x4_L16_SUB - bl ZGEMM_2x4_L16_SUB - b ZGEMM_L2x4_SAVE +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE MY_ALIGN -ZGEMM_L2x4_SUB2: + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_8: + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x4_SUB2_4 bl ZGEMM_2x4_L8_SUB MY_ALIGN + + ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_E 64,32, 1,1 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 - LOAD2x4 0 - KERNEL2x4_E 64,32, 0,1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ andi. 
T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif - SAVE2x4 ZGEMM_L2x4_END: +/*----------------------------------------*/ -ZGEMM_L2x2_BEGIN: - andi. T1, M, 2 - ble ZGEMM_L2x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x2 - ble ZGEMM_L2x2_SUB0 +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -ZGEMM_L2x2_LOOP_START: - LOAD2x2 0 - mtctr L - MY_ALIGN -ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,32,0,0 - KERNEL2x2_L 32,32,1,0 - KERNEL2x2_L 32,32,2,0 - KERNEL2x2_L 32,32,3,0 - KERNEL2x2_L 32,32,4,0 - KERNEL2x2_L 32,32,5,0 - KERNEL2x2_L 32,32,6,0 - KERNEL2x2_L 32,32,7,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN -ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32,32 - - b ZGEMM_L2x2_SUB1 - ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x2_SUB2 - -ZGEMM_L2x2_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x2_SAVE ZGEMM_L2x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB MY_ALIGN -ZGEMM_L2x2_SUB2_LOOP: - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_L 32,32, 1,0 - KERNEL2x2_L 32,32, 2,0 - KERNEL2x2_E 32,32, 3,1 - bdnz ZGEMM_L2x2_SUB2_LOOP + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB MY_ALIGN + + ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_E 32,32, 1,1 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 MY_ALIGN + + ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 - LOAD2x2 0 - KERNEL2x2_E 32,32, 0,1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 MY_ALIGN -ZGEMM_L2x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 -ZGEMM_L2x2_SAVE: - SAVE2x2 + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + ZGEMM_L2x2_END: +/*----------------------------------------*/ +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -ZGEMM_L2x1_BEGIN: - andi. T1, M, 1 - ble ZGEMM_L2x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x1 - ble ZGEMM_L2x1_SUB0 -ZGEMM_L2x1_LOOP_START: - - LOAD2x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,32,0,0 - KERNEL2x1_L 16,32,1,0 - KERNEL2x1_L 16,32,2,0 - KERNEL2x1_L 16,32,3,0 - KERNEL2x1_L 16,32,4,0 - KERNEL2x1_L 16,32,5,0 - KERNEL2x1_L 16,32,6,0 - KERNEL2x1_L 16,32,7,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16,32 - - b ZGEMM_L2x1_SUB1 - ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x1_SUB2 - -ZGEMM_L2x1_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x1_SAVE ZGEMM_L2x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB MY_ALIGN -ZGEMM_L2x1_SUB2_LOOP: - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_L 16,32, 1,0 - KERNEL2x1_L 16,32, 2,0 - KERNEL2x1_E 16,32, 3,1 - bdnz ZGEMM_L2x1_SUB2_LOOP + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB MY_ALIGN + + ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_E 16,32, 1,1 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 - LOAD2x1 0 - KERNEL2x1_E 16,32, 0,1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif - SAVE2x1 ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN - slwi T1, K, 5 - add B, B, T1 - - addic. J, J, -1 - bgt ZGEMM_L2_BEGIN - - andi. 
T2, N, 1 - ble L999 ZGEMM_L2_END: - b ZGEMM_L1_BEGIN +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ -L999_H1: - b L999 +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 
256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + 
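/* note (annotation): the caller ZGEMM_L1x1_BEGIN sets T8 = (K-2) >> 5 (using the TRMM-adjusted count T6 when TRMMKERNEL is defined) before branching here; each pass through ZGEMM_L1x1_LOOP executes 16 KERNEL1x1_L2 steps of two k-iterations each, i.e. 32 k-iterations per CTR decrement */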
mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + ZGEMM_L1_BEGIN: - andi. T1, N, 1 - ble ZGEMM_L1_END +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 - mr CO, C - mr AO, A - srawi. I, M, 3 - ble ZGEMM_L1x8_END ZGEMM_L1x8_BEGIN: - - - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - - -ZGEMM_L1x8_LOOP_START: - - LOAD1x8 0 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + - MY_ALIGN -ZGEMM_L1x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L 128,16,0,0 - KERNEL1x8_L 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L 128,16,2,0 - KERNEL1x8_L 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L 128,16,4,0 - KERNEL1x8_L 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L 128,16,6,0 - KERNEL1x8_L 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L 128,16,8,0 - KERNEL1x8_L 128,16,9,0 - KERNEL1x8_L 128,16,10,0 - KERNEL1x8_L 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L 128,16,12,0 - KERNEL1x8_L 128,16,13,0 - KERNEL1x8_L 128,16,14,0 - KERNEL1x8_L 128,16,15,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128,16 - - b ZGEMM_L1x8_SUB1 - ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x8_SUB2 - -ZGEMM_L1x8_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x8_SAVE ZGEMM_L1x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x8_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB MY_ALIGN -ZGEMM_L1x8_SUB2_LOOP: - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_L 128,16, 1,0 - KERNEL1x8_L 128,16, 2,0 - KERNEL1x8_E 128,16, 3,1 - bdnz ZGEMM_L1x8_SUB2_LOOP - MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_E 128,16, 1,1 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 - LOAD1x8 0 - KERNEL1x8_E 128,16, 0,1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN - SAVE1x8 - - addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN ZGEMM_L1x8_END: +/*----------------------------------------*/ + ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - -ZGEMM_L1x4_LOOP_START: - LOAD1x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,16,0,0 - KERNEL1x4_L 64,16,1,0 - KERNEL1x4_L 64,16,2,0 - KERNEL1x4_L 64,16,3,0 - KERNEL1x4_L 64,16,4,0 - KERNEL1x4_L 64,16,5,0 - KERNEL1x4_L 64,16,6,0 - KERNEL1x4_L 64,16,7,0 - KERNEL1x4_L 64,16,8,0 - KERNEL1x4_L 64,16,9,0 - KERNEL1x4_L 64,16,10,0 - KERNEL1x4_L 64,16,11,0 - KERNEL1x4_L 64,16,12,0 - KERNEL1x4_L 64,16,13,0 - KERNEL1x4_L 64,16,14,0 - KERNEL1x4_L 64,16,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN -ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64,16 - - b ZGEMM_L1x4_SUB1 - ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x4_SUB2 - -ZGEMM_L1x4_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE ZGEMM_L1x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x4_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB MY_ALIGN -ZGEMM_L1x4_SUB2_LOOP: - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_L 64,16, 1,0 - KERNEL1x4_L 64,16, 2,0 - KERNEL1x4_E 64,16, 3,1 - bdnz ZGEMM_L1x4_SUB2_LOOP + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB MY_ALIGN + + ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_E 64,16, 1,1 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x4_SUB2_1 - LOAD1x4 0 - KERNEL1x4_E 64,16, 0,1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif - SAVE1x4 ZGEMM_L1x4_END: +/*----------------------------------------*/ + ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 - andi. T1, M, 2 - ble ZGEMM_L1x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - -ZGEMM_L1x2_LOOP_START: - LOAD1x2 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,16,0,0 - KERNEL1x2_L 32,16,1,0 - KERNEL1x2_L 32,16,2,0 - KERNEL1x2_L 32,16,3,0 - KERNEL1x2_L 32,16,4,0 - KERNEL1x2_L 32,16,5,0 - KERNEL1x2_L 32,16,6,0 - KERNEL1x2_L 32,16,7,0 - KERNEL1x2_L 32,16,8,0 - KERNEL1x2_L 32,16,9,0 - KERNEL1x2_L 32,16,10,0 - KERNEL1x2_L 32,16,11,0 - KERNEL1x2_L 32,16,12,0 - KERNEL1x2_L 32,16,13,0 - KERNEL1x2_L 32,16,14,0 - KERNEL1x2_L 32,16,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN -ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32,16 - - b ZGEMM_L1x2_SUB1 - ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x2_SUB2 - -ZGEMM_L1x2_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE ZGEMM_L1x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB MY_ALIGN -ZGEMM_L1x2_SUB2_LOOP: - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_L 32,16, 1,0 - KERNEL1x2_L 32,16, 2,0 - KERNEL1x2_E 32,16, 3,1 - bdnz ZGEMM_L1x2_SUB2_LOOP + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB MY_ALIGN + + ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_E 32,16, 1,1 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 MY_ALIGN + + ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 - LOAD1x2 0 - KERNEL1x2_E 32,16, 0,1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 MY_ALIGN -ZGEMM_L1x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 -ZGEMM_L1x2_SAVE: - SAVE1x2 + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + ZGEMM_L1x2_END: +/*----------------------------------------*/ + ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 - andi. T1, M, 1 - ble ZGEMM_L1x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - -ZGEMM_L1x1_LOOP_START: - - LOAD1x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,16,0,0 - KERNEL1x1_L 16,16,1,0 - KERNEL1x1_L 16,16,2,0 - KERNEL1x1_L 16,16,3,0 - KERNEL1x1_L 16,16,4,0 - KERNEL1x1_L 16,16,5,0 - KERNEL1x1_L 16,16,6,0 - KERNEL1x1_L 16,16,7,0 - KERNEL1x1_L 16,16,8,0 - KERNEL1x1_L 16,16,9,0 - KERNEL1x1_L 16,16,10,0 - KERNEL1x1_L 16,16,11,0 - KERNEL1x1_L 16,16,12,0 - KERNEL1x1_L 16,16,13,0 - KERNEL1x1_L 16,16,14,0 - KERNEL1x1_L 16,16,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN -ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 16 - - b ZGEMM_L1x1_SUB1 - ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x1_SUB2 - -ZGEMM_L1x1_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE ZGEMM_L1x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB MY_ALIGN -ZGEMM_L1x1_SUB2_LOOP: - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_L 16,16, 1,0 - KERNEL1x1_L 16,16, 2,0 - KERNEL1x1_E 16,16, 3,1 - bdnz ZGEMM_L1x1_SUB2_LOOP + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB MY_ALIGN + + ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_E 16,16, 1,1 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x1_SUB2_1 - LOAD1x1 0 - KERNEL1x1_E 16,16, 0,1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif - SAVE1x1 ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + ZGEMM_L1_END: +/*----------------------------------------*/ + \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 10d9e4cc3..8670e9574 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,7 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) #define DISP16(ind,disp) (ind*unit_size*16+disp) @@ -34,10 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) #define DISPX(disp) (disp) - /* HELPERS FOR SAVE */ - /* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + .macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET #ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) @@ -46,20 +45,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 #endif .endm - /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ .endm - /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ .endm - /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + .macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI #if defined(NN) || defined(NT) || defined(TN) || defined(TT) xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR @@ -78,8 +80,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI #endif .endm - /* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + .macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 #ifndef TRMMKERNEL xvmsubadp \VSOUT1,\VSINII, alpha_i @@ -89,23 +92,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp \VSOUT2,\VSINRR, alpha_i #endif .endm - /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + .macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 xvmsubadp \VSOUT1,\VSINRR, alpha_r xvmaddadp \VSOUT2,\VSINII, alpha_r .endm - /* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 .endm + + .macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 stxv \VSIN1, DISPX(\LOFFSET)(\REG) stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) .endm + .macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -141,6 +149,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 .endm + .macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -161,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -173,6 +183,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 #ifndef TRMMKERNEL @@ -188,9 +199,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxmrghd vs7,vs15,vs14 stxv vs7, (\LOFFSET)(\BASE_REG) .endm - /********************************************************************************************** -* Macros for N=2 and M=8 +* + +.macros for N=2 and M=8 **********************************************************************************************/ .macro Zero2x8 @@ -228,269 +240,272 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs63, vs63, vs63 .endm -.macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero2x8 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x8_NORMAL END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - - xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 xvmaddadp vs63, vs7, vs19 - .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B 
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm .macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - - xxswapd vs21, vs20 - xxswapd vs23, vs22 - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 xvmaddadp vs50, vs1, vs18 - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs52, vs2, vs18 - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs38, vs3, vs16 xvmaddadp vs54, vs3, vs18 - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + 
lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 - -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs48, vs8, vs22 .if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif - -.endif + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs33, vs8, vs21 xvmaddadp vs49, vs8, vs23 - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - xvmaddadp vs34, vs9, vs20 xvmaddadp vs50, vs9, vs22 xvmaddadp vs35, vs9, vs21 xvmaddadp vs51, vs9, vs23 - +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs52, vs10, vs22 xvmaddadp vs37, vs10, vs21 xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs54, vs11, vs22 xvmaddadp vs39, vs11, vs21 xvmaddadp vs55, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs56, vs12, vs22 xvmaddadp vs41, vs12, vs21 xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 xvmaddadp vs58, vs13, vs22 xvmaddadp vs43, vs13, vs21 xvmaddadp vs59, vs13, vs23 - +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs60, vs14, vs22 xvmaddadp vs45, vs14, vs21 xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 xvmaddadp vs62, vs15, vs22 xvmaddadp vs47, vs15, vs21 xvmaddadp vs63, vs15, vs23 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if 
\IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + + + .macro KERNEL2x8 - LOAD2x8 0 + LOAD2x8 END2x8 AO, BO, 128,32 .endm -.macro SAVE2x8 +.macro SAVE2x8 add T1, CO ,LDC SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=2 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero2x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -510,167 +525,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs47, vs47, vs47 .endm -.macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero2x4 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x4_NORMAL END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, 
DISP4(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) -.endif -.endif - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm -.if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 .endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif 
+ +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x4 - LOAD2x4 0 + LOAD2x4 END2x4 AO, BO, 64,32 .endm + + .macro SAVE2x4 add T1, CO ,LDC SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=2 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero2x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -680,231 +727,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 + .endm -.macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - - -.if \Zero==1 - Zero2x2 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x2_NORMAL END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs36, vs0, vs18 - xvmaddadp vs37, vs0, vs19 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 - -.endm - -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - xvmaddadp vs36, vs0, vs18 xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs37, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs39, vs9, vs23 +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + 
KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 .endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x2 - LOAD2x2 0 + LOAD2x2 END2x2 AO, BO, 32,32 .endm + + .macro SAVE2x2 add T1, CO ,LDC SAVE2 vs32,vs33,vs34,vs35,CO,0 SAVE2 vs36,vs37,vs38,vs39,T1,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=2 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero2x1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 + .endm -.macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.if \Zero==1 - Zero2x1 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x1_NORMAL END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 - -.endm - -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 
-.endm - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - xxswapd vs21, vs20 - xxswapd vs23, vs22 -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs35, vs8, vs23 +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 .endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x1 - LOAD2x1 0 + LOAD2x1 END2x1 AO, BO, 16,32 .endm + + .macro SAVE2x1 add T1, CO ,LDC SAVE1 vs32,vs33,CO,0 @@ -913,8 +1028,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm /********************************************************************************************** -* Macros for N=1 and M=8 +* + +.macros for N=1 and M=8 **********************************************************************************************/ + + .macro Zero1x8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -932,167 +1051,228 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 .endm -.macro LOAD1x8 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero1x8 -.endif +.macro LOAD1x8 + LOAD1x8O 0,0 .endm + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x8_NORMAL END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + 
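The *_2 load/kernel macros implement an unroll-by-two software pipeline: LOAD1x8_2O pulls two K-iterations of A (vs0-vs7 and vs8-vs15) and of B (vs16 and vs20) into registers, and each KERNEL1x8_L2 call issues the FMAs for one pair while reloading the operands for the next, until END1x8_2 runs with Complete==1 and advances AO/BO by the doubled offsets (256 and 32). The following is a rough C analogy only, with illustrative names: the complex arithmetic is collapsed into C's complex multiply (the kernel actually keeps a plain and an xxswapd'd accumulator per vector, e.g. vs32/vs33, and recombines them in SAVE8), and the pointer advance really happens only on the IsLast call via DISP offsets.

    #include <complex.h>

    /* Hypothetical sketch of the unroll-by-2 pipelining; not OpenBLAS code. */
    static void zgemm_1x8_unroll2(const double complex *A, const double complex *B,
                                  double complex acc[8], long k2)
    {
        double complex a0[8], a1[8], b0, b1;
        /* LOAD1x8_2O: preload both halves of the first K-pair */
        for (int i = 0; i < 8; i++) { a0[i] = A[i]; a1[i] = A[8 + i]; }
        b0 = B[0]; b1 = B[1];
        A += 16; B += 2;
        for (long k = 0; k < k2; k++) {        /* each pass covers two values of K */
            for (int i = 0; i < 8; i++)
                acc[i] += a0[i] * b0;          /* "even" half: vs0-vs7 with vs16/vs17 */
            if (k + 1 < k2) {                  /* Complete==0: refill the even half */
                for (int i = 0; i < 8; i++) a0[i] = A[i];
                b0 = B[0];
            }
            for (int i = 0; i < 8; i++)
                acc[i] += a1[i] * b1;          /* "odd" half: vs8-vs15 with vs20/vs21 */
            if (k + 1 < k2) {                  /* Complete==0: refill the odd half */
                for (int i = 0; i < 8; i++) a1[i] = A[8 + i];
                b1 = B[1];
                A += 16; B += 2;               /* IsLast: AO += 256 bytes, BO += 32 bytes */
            }
        }
    }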
+.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 - - +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, 
DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs41, vs12, vs21 xvmaddadp vs42, vs13, vs20 xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs45, vs14, vs21 xvmaddadp vs46, vs15, vs20 xvmaddadp vs47, vs15, vs21 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + + + .macro KERNEL1x8 - LOAD1x8 0 + LOAD1x8 END1x8 AO, BO, 128,16 .endm + .macro SAVE1x8 - - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=1 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero1x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -1104,324 +1284,542 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs39, vs39, vs39 .endm -.macro LOAD1x4 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero1x4 -.endif +.macro LOAD1x4 + LOAD1x4O 0,0 .endm + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x4_NORMAL END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 - -.if 
\Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif .if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x4 - LOAD1x4 0 + LOAD1x4 END1x4 AO, BO, 64,16 .endm -.macro SAVE1x4 + + +.macro SAVE1x4 SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=1 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero1x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 -.endm - -.macro LOAD1x2 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - -.if \Zero==1 - Zero1x2 -.endif + xxlxor vs35, vs35, vs35 .endm + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x2_NORMAL END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, 
vs17 .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x2 - LOAD1x2 0 + LOAD1x2 END1x2 AO, BO, 32,16 .endm -.macro SAVE1x2 + + +.macro SAVE1x2 SAVE2 vs32,vs33,vs34,vs35,CO,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=1 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero1x1 xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs33, vs33, vs33 .endm -.macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 -.if \Zero==1 - Zero1x1 -.endif - +.macro LOAD1x1 + 
LOAD1x1O 0,0 .endm + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + .macro END1x1_NORMAL END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .endm -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17, vs16 -.endif - .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) .else - addi \AREG, \AREG, DISP2(\Index,32) + addi \AREG, \AREG, DISP2(\Index,32) addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - +.endif .endm + + .macro KERNEL1x1 - LOAD1x1 0 + LOAD1x1 END1x1 AO, BO, 16,16 - .endm -.macro SAVE1x1 + + +.macro SAVE1x1 SAVE1 vs32,vs33,CO,0 addi CO, CO, 16 .endm +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi 
\REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 8f78a6a64..9a1a68ecd 100644 --- a/param.h +++ b/param.h @@ -2256,7 +2256,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1025 +#define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8
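The TRMM helper macros above encode, in registers, the pointer arithmetic that their C-style comments describe. Below is a minimal C sketch under stated assumptions: helper names are hypothetical, pointers are plain double* (each complex element is two doubles, i.e. 16 bytes, which is why SHIFT_REG turns an element count of 16/8/4/2/1 into a shift by 8/7/6/5/4, since off*count*16 == off << (log2(count)+4)).

    /* Hypothetical C rendering of REFRESH_POINTERS / REFRESH_TEMP_BK; not OpenBLAS code. */
    static void refresh_pointers(double **ptrba, double **ptrbb, double *bb,
                                 long off, long c_a, long c_b,
                                 int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            *ptrbb = bb;                      /* ptrbb = bb; ptrba is left as-is */
        } else {
            *ptrba += off * c_a * 2;          /* off*C_A complex elements, 2 doubles each */
            *ptrbb  = bb + off * c_b * 2;     /* ptrbb = bb + off*C_B */
        }
    }

    static long refresh_temp_bk(long bk, long off, long c_a, long c_b,
                                int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return bk - off;                  /* temp = bk - off */
        else if (left)
            return off + c_a;                 /* temp = off + number of values in A */
        else
            return off + c_b;                 /* temp = off + number of values in B */
    }

REFRESH_AFTER_SAVE performs the mirrored post-save step: temp = bk - off, reduced by C_A (LEFT) or C_B, after which ptrba and ptrbb are advanced by temp*C_A and temp*C_B elements and, for LEFT, off is incremented by C_A.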