From 6a2bde7a2de5a0dc5ae95d6b78884e53426129ad Mon Sep 17 00:00:00 2001
From: Werner Saar
Date: Tue, 17 May 2016 14:45:27 +0200
Subject: [PATCH] optimized dgemm and dgetrf for POWER8

---
 common.h                                    |  7 +++
 kernel/power/dgemm_logic_16x4_power8.S      | 63 ++++++++++++---------
 kernel/power/dgemm_ncopy_macros_4_power8.S  |  7 +++
 kernel/power/dgemm_tcopy_16_power8.S        |  2 +-
 kernel/power/dgemm_tcopy_logic_16_power8.S  | 24 ++++----
 kernel/power/dgemm_tcopy_macros_16_power8.S | 40 ++++++-------
 lapack/getrf/getrf_parallel_omp.c           |  9 ++-
 7 files changed, 90 insertions(+), 62 deletions(-)

diff --git a/common.h b/common.h
index c6f7ea2fd..a7342db2c 100644
--- a/common.h
+++ b/common.h
@@ -332,6 +332,13 @@ typedef int blasint;
 #endif
 #endif
 
+#ifdef POWER8
+#ifndef YIELDING
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+#endif
+
+
 /*
 #ifdef PILEDRIVER
 #ifndef YIELDING
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
index edfcc4bcc..cacfab1f6 100644
--- a/kernel/power/dgemm_logic_16x4_power8.S
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * LAPACK-TEST		: OK
 **************************************************************************************/
 
+#define MY_ALIGN .align 3
 
 	srawi.		J,	N,	2
 	ble		LDGEMM_L4_END
@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN:
 	srawi.		I,	M,	4
 	ble		LDGEMM_L4x16_END
 
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_BEGIN_FIRST:
 
 	li		L,	-128
@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST:
 	cmpwi		cr0,	L,	1
 	ble		LDGEMM_L4x16_SUB4_FIRST
 
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_LOOP_START_FIRST:
 
 	li		T2,	512
@@ -115,7 +116,7 @@
 	ble		LDGEMM_L4x16_LOOP_END_FIRST
 
 	mtctr		L
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_LOOP_FIRST:
 
@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST:
 
 	bdnz		LDGEMM_L4x16_LOOP_FIRST
 
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_LOOP_END_FIRST:
 
@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST:
 	addic.		L,	L,	-1
 	bgt		LDGEMM_L4x16_SUB2_FIRST
 
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_SAVE_FIRST:
 
 	SAVE4x16
@@ -185,7 +186,8 @@ LDGEMM_L4x16_END_FIRST:
 
-	.align 4
+	MY_ALIGN
+
 LDGEMM_L4x16_BEGIN:
 
 	li		L,	-128
@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN:
 	cmpwi		cr0,	L,	1
 	ble-		LDGEMM_L4x16_SUB4
 
-	.align 4
+	MY_ALIGN
+
 LDGEMM_L4x16_LOOP_START:
 
 	li		o40,	40
@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START:
 	ble-		LDGEMM_L4x16_LOOP_END
 
 	mtctr		L
-	.align 4
+	MY_ALIGN
 
 LDGEMM_L4x16_LOOP:
 
-	dcbt		AO,	PRE
 	KERNEL4x16_L1
 	dcbt		AO,	PRE
-//	addic.		L,	L,	-1
 	KERNEL4x16_L2
 
 	bdnz+		LDGEMM_L4x16_LOOP
 
-	.align 4
+
+	MY_ALIGN
 
 LDGEMM_L4x16_LOOP_END:
@@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END:
 
 	b		LDGEMM_L4x16_SUB1
 
+	MY_ALIGN
+
 LDGEMM_L4x16_SUB4:
 
 	KERNEL4x16_SUBI1
@@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4:
 
 	b		LDGEMM_L4x16_SUB1
 
+	MY_ALIGN
+
 LDGEMM_L4x16_SUB0:
 
 	andi.		L,	K,	1
@@ -278,11 +284,15 @@
 
 	ble		LDGEMM_L4x16_SAVE
 	b		LDGEMM_L4x16_SUB2
 
+	MY_ALIGN
+
 LDGEMM_L4x16_SUB1:
 
 	andi.		L,	K,	1
 	ble		LDGEMM_L4x16_SAVE
 
+	MY_ALIGN
+
 LDGEMM_L4x16_SUB2:
 
 	KERNEL4x16_SUB1
@@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2:
 	addic.		L,	L,	-1
 	bgt		LDGEMM_L4x16_SUB2
 
-	.align 4
+	MY_ALIGN
+
 LDGEMM_L4x16_SAVE:
 
 	SAVE4x16
@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L4x8_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L4x8_LOOP:
 
@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L4x4_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L4x4_LOOP:
 
@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L4x2_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L4x2_LOOP:
 
@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L4x1_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L4x1_LOOP:
 
@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L2x16_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L2x16_LOOP:
 
@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L2x8_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L2x8_LOOP:
 
@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L2x4_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L2x4_LOOP:
 
@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L2x2_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L2x2_LOOP:
 
@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L2x1_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L2x1_LOOP:
 
@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L1x16_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L1x16_LOOP:
 
@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L1x8_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L1x8_LOOP:
 
@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L1x4_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L1x4_LOOP:
 
@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L1x2_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L1x2_LOOP:
 
@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START:
 	addic.		L,	L,	-2
 	ble		LDGEMM_L1x1_LOOP_END
 
-	.align 5
+	MY_ALIGN
 
 LDGEMM_L1x1_LOOP:
 
diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S
index 9b07d73f5..fafb09877 100644
--- a/kernel/power/dgemm_ncopy_macros_4_power8.S
+++ b/kernel/power/dgemm_ncopy_macros_4_power8.S
@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs62,	vs7,	vs15,	3
 	xxpermdi	vs63,	vs23,	vs31,	3
 
+	dcbt		BO,	PREB
 	stxvd2x		vs32,	o0,	BO
 	stxvd2x		vs33,	o16,	BO
@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs39,	o112,	BO
 	addi		BO,	BO,	128
 
+	dcbt		BO,	PREB
+
 	stxvd2x		vs40,	o0,	BO
 	stxvd2x		vs41,	o16,	BO
 	stxvd2x		vs42,	o32,	BO
@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs47,	o112,	BO
 	addi		BO,	BO,	128
 
+	dcbt		BO,	PREB
+
 	stxvd2x		vs48,	o0,	BO
 	stxvd2x		vs49,	o16,	BO
 	stxvd2x		vs50,	o32,	BO
@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs55,	o112,	BO
 	addi		BO,	BO,	128
 
+	dcbt		BO,	PREB
+
 	stxvd2x		vs56,	o0,	BO
 	stxvd2x		vs57,	o16,	BO
 	stxvd2x		vs58,	o32,	BO
diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
index eca78bac4..eb37877e0 100644
--- a/kernel/power/dgemm_tcopy_16_power8.S
+++ b/kernel/power/dgemm_tcopy_16_power8.S
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		B2,	B2,	B
 	add		B1,	B1,	B
 
-	li		PREA,	256
+	li		PREA,	384
 	addi		PREB,	M16,	128
 
 	li		o8,	8
diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S
index 28fc74793..3c34a6167 100644
--- a/kernel/power/dgemm_tcopy_logic_16_power8.S
+++ b/kernel/power/dgemm_tcopy_logic_16_power8.S
@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN:
 	ble		DCOPYT_L4x8_BEGIN
 
 	mr		BO,	B16
+	addi		T2,	M16,	384
+	mtctr		J
 
 	.align 5
 
 DCOPYT_L4x16_LOOP:
 
-/*
-	addi		T1,	PREB,	128
-	addi		T2,	PREB,	256
-*/
+	addi		T1,	M16,	256
+
 	dcbt		A0,	PREA
 	dcbt		A1,	PREA
 	dcbt		A2,	PREA
 	dcbt		A3,	PREA
-/*
-	dcbtst		BO,	M16
-	dcbtst		BO,	PREB
-	dcbtst		BO,	T1
-	dcbtst		BO,	T2
-*/
+
+	dcbt		BO,	M16
+	dcbt		BO,	PREB
+	dcbt		BO,	T1
+	dcbt		BO,	T2
+
 	COPY_4x16
 
 	add		BO,	BO,	M16
-	addic.		J,	J,	-1
-	bgt		DCOPYT_L4x16_LOOP
+//	addic.		J,	J,	-1
+	bdnz+		DCOPYT_L4x16_LOOP
 
 DCOPYT_L4x8_BEGIN:
diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S
index aef03d7cf..333e23105 100644
--- a/kernel/power/dgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/dgemm_tcopy_macros_16_power8.S
@@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x		vs35,	o48,	A0
 	addi		A0,	A0,	64
 
-	lxvd2x		vs36,	o0,	A0
-	lxvd2x		vs37,	o16,	A0
-	lxvd2x		vs38,	o32,	A0
-	lxvd2x		vs39,	o48,	A0
-	addi		A0,	A0,	64
-
-
 	lxvd2x		vs40,	o0,	A1
 	lxvd2x		vs41,	o16,	A1
 	lxvd2x		vs42,	o32,	A1
 	lxvd2x		vs43,	o48,	A1
 	addi		A1,	A1,	64
 
-	lxvd2x		vs44,	o0,	A1
-	lxvd2x		vs45,	o16,	A1
-	lxvd2x		vs46,	o32,	A1
-	lxvd2x		vs47,	o48,	A1
-	addi		A1,	A1,	64
-
-
 	lxvd2x		vs48,	o0,	A2
 	lxvd2x		vs49,	o16,	A2
 	lxvd2x		vs50,	o32,	A2
 	lxvd2x		vs51,	o48,	A2
 	addi		A2,	A2,	64
 
-	lxvd2x		vs52,	o0,	A2
-	lxvd2x		vs53,	o16,	A2
-	lxvd2x		vs54,	o32,	A2
-	lxvd2x		vs55,	o48,	A2
-	addi		A2,	A2,	64
-
-
 	lxvd2x		vs56,	o0,	A3
 	lxvd2x		vs57,	o16,	A3
 	lxvd2x		vs58,	o32,	A3
 	lxvd2x		vs59,	o48,	A3
 	addi		A3,	A3,	64
 
+	lxvd2x		vs36,	o0,	A0
+	lxvd2x		vs37,	o16,	A0
+	lxvd2x		vs38,	o32,	A0
+	lxvd2x		vs39,	o48,	A0
+	addi		A0,	A0,	64
+
+	lxvd2x		vs44,	o0,	A1
+	lxvd2x		vs45,	o16,	A1
+	lxvd2x		vs46,	o32,	A1
+	lxvd2x		vs47,	o48,	A1
+	addi		A1,	A1,	64
+
+	lxvd2x		vs52,	o0,	A2
+	lxvd2x		vs53,	o16,	A2
+	lxvd2x		vs54,	o32,	A2
+	lxvd2x		vs55,	o48,	A2
+	addi		A2,	A2,	64
+
 	lxvd2x		vs60,	o0,	A3
 	lxvd2x		vs61,	o16,	A3
 	lxvd2x		vs62,	o32,	A3
 	lxvd2x		vs63,	o48,	A3
 	addi		A3,	A3,	64
 
-
 	mr		T1,	BO
 
 	stxvd2x		vs32,	o0,	T1
diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c
index 7e2319718..6b8cbda2f 100644
--- a/lapack/getrf/getrf_parallel_omp.c
+++ b/lapack/getrf/getrf_parallel_omp.c
@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
   blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
   if (blocking > GEMM_Q) blocking = GEMM_Q;
 
-  if (blocking <= GEMM_UNROLL_N * 2) {
+#ifdef POWER8
+  if (blocking <= GEMM_UNROLL_N) {
     info = GETF2(args, NULL, range_n, sa, sb, 0);
     return info;
   }
+#else
+  if (blocking <= GEMM_UNROLL_N*2) {
+    info = GETF2(args, NULL, range_n, sa, sb, 0);
+    return info;
+  }
+#endif
 
   sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
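
Note on the common.h hunk: on POWER8 this redefines YIELDING from its default (a sched_yield() call in stock common.h) to eight nops, so a thread polling a synchronization flag spins in place for a few cycles instead of entering the kernel on every iteration. A minimal standalone C sketch of how a spin-wait loop consumes such a macro; the spin_wait helper and flag are illustrative, not OpenBLAS code:

  #include <stdatomic.h>

  /* POWER8 variant: burn a few cycles on-core per poll instead of
     calling sched_yield(). */
  #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");

  /* Hypothetical spin-wait: poll until another thread publishes *flag. */
  static void spin_wait(volatile _Atomic int *flag) {
    while (!atomic_load_explicit(flag, memory_order_acquire)) {
      YIELDING;
    }
  }

For the short, heavily synchronized waits in a parallel LU factorization, staying on-core is usually cheaper than the syscall round-trip of sched_yield().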
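
Note on the copy-kernel hunks: the tcopy/ncopy changes reorder the vector loads and replace the commented-out dcbtst store hints with active dcbt prefetches ahead of the stxvd2x bursts, with the source prefetch distance raised from 256 to 384 bytes (li PREA, 384). A rough portable-C analogue of the same idea using GCC's __builtin_prefetch; the function and distance here are illustrative only:

  /* Touch lines PREFETCH_DIST bytes ahead so they are resident by the
     time the copy reaches them; mirrors the new li PREA, 384. */
  #define PREFETCH_DIST 384

  static void copy_with_prefetch(double *dst, const double *src, long n) {
    long i;
    /* One hint per 64-byte line (8 doubles), as the unrolled kernels do. */
    for (i = 0; i + 8 <= n; i += 8) {
      __builtin_prefetch((const char *)(src + i) + PREFETCH_DIST, 0, 0);
      __builtin_prefetch((char *)(dst + i) + PREFETCH_DIST, 1, 0);
      for (long j = 0; j < 8; j++) dst[i + j] = src[i + j];
    }
    for (; i < n; i++) dst[i] = src[i];  /* remainder */
  }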
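
Note on the getrf_parallel_omp.c hunk: blocking is mn/2 rounded up to a multiple of GEMM_UNROLL_N with the usual power-of-two align-up mask, then capped at GEMM_Q; on POWER8 the serial GETF2 fallback now triggers only when blocking <= GEMM_UNROLL_N rather than GEMM_UNROLL_N * 2, so the parallel path also covers smaller factorizations. A worked illustration with made-up values (GEMM_UNROLL_N and GEMM_Q vary per target):

  #include <stdio.h>

  /* Round x up to a multiple of step (step must be a power of two), mirroring
     blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); */
  static long round_up(long x, long step) {
    return (x + step - 1) & ~(step - 1);
  }

  int main(void) {
    long mn = 100, unroll_n = 4, gemm_q = 128;   /* illustrative values only */
    long blocking = round_up(mn / 2, unroll_n);  /* 50 -> 52 */
    if (blocking > gemm_q) blocking = gemm_q;
    printf("blocking = %ld\n", blocking);        /* prints 52 */
    /* POWER8: serial GETF2 only if blocking <= unroll_n (4);
       generic: if blocking <= unroll_n * 2 (8). Here 52 exceeds both. */
    return 0;
  }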