From 8310d4d3f7e5258965991e9df252fab654d7d368 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 16 May 2016 14:14:25 +0200 Subject: [PATCH] optimized dgemm for 20 threads --- Makefile.power | 4 +- common_power.h | 2 +- kernel/power/dgemm_logic_16x4_power8.S | 158 ++++++++++++++++++++++-- kernel/power/dgemm_macros_16x4_power8.S | 74 ++++++----- param.h | 6 +- 5 files changed, 191 insertions(+), 53 deletions(-) diff --git a/Makefile.power b/Makefile.power index 48bcb77f8..589d67441 100644 --- a/Makefile.power +++ b/Makefile.power @@ -13,10 +13,10 @@ endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math endif endif diff --git a/common_power.h b/common_power.h index b62aca303..e3a1a7aef 100644 --- a/common_power.h +++ b/common_power.h @@ -803,7 +803,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S index 718f80bdd..edfcc4bcc 100644 --- a/kernel/power/dgemm_logic_16x4_power8.S +++ b/kernel/power/dgemm_logic_16x4_power8.S @@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDGEMM_L4_BEGIN: - mr CO, C + li T1, 128 + li T2, 256 mr AO, A - slwi T1, LDC , 2 - add C, C, T1 + + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 + srawi. I, M, 4 ble LDGEMM_L4x16_END + .align 4 +LDGEMM_L4x16_BEGIN_FIRST: + + li L, -128 + + mr T1, CO + add T2, T1, LDC + add T3, T2, LDC + add T4, T3, LDC + + and T1, T1, L + and T2, T2, L + and T3, T3, L + and T4, T4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + mr BO, B + srawi. L, K, 2 + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + ble LDGEMM_L4x16_SUB0_FIRST + cmpwi cr0, L, 1 + ble LDGEMM_L4x16_SUB4_FIRST + + .align 4 +LDGEMM_L4x16_LOOP_START_FIRST: + + li T2, 512 + li o40, 40 + li o56, 56 + + dcbt AO, PRE + dcbt BO, T2 + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + addic. L, L, -2 + KERNEL4x16_L2 + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 + + ble LDGEMM_L4x16_LOOP_END_FIRST + mtctr L + + .align 4 + +LDGEMM_L4x16_LOOP_FIRST: + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + KERNEL4x16_L2 + + dcbt AO, PRE + KERNEL4x16_L1 + dcbt AO, PRE + dcbt BO, T2 + KERNEL4x16_L2 + + bdnz LDGEMM_L4x16_LOOP_FIRST + + .align 4 + +LDGEMM_L4x16_LOOP_END_FIRST: + + KERNEL4x16_L1 + KERNEL4x16_L2 + + KERNEL4x16_1 + KERNEL4x16_E2 + + b LDGEMM_L4x16_SUB1_FIRST + +LDGEMM_L4x16_SUB4_FIRST: + + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b LDGEMM_L4x16_SUB1_FIRST + +LDGEMM_L4x16_SUB0_FIRST: + + andi. L, K, 3 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE_FIRST + b LDGEMM_L4x16_SUB2_FIRST + +LDGEMM_L4x16_SUB1_FIRST: + + andi. L, K, 3 + ble LDGEMM_L4x16_SAVE_FIRST + +LDGEMM_L4x16_SUB2_FIRST: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x16_SUB2_FIRST + + .align 4 +LDGEMM_L4x16_SAVE_FIRST: + + SAVE4x16 + + addic. I, I, -1 + ble LDGEMM_L4x16_END + +LDGEMM_L4x16_END_FIRST: + .align 4 LDGEMM_L4x16_BEGIN: @@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN: dcbt T3, r0 dcbt T4, r0 - ble LDGEMM_L4x16_SUB0 + ble- LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble LDGEMM_L4x16_SUB4 + ble- LDGEMM_L4x16_SUB4 .align 4 LDGEMM_L4x16_LOOP_START: @@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START: addic. L, L, -2 KERNEL4x16_L2 - ble LDGEMM_L4x16_LOOP_END + ble- LDGEMM_L4x16_LOOP_END + mtctr L .align 4 @@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE - addic. L, L, -1 + // addic. L, L, -1 KERNEL4x16_L2 - bgt LDGEMM_L4x16_LOOP + bdnz+ LDGEMM_L4x16_LOOP .align 4 @@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt LDGEMM_L4x16_BEGIN + bgt+ LDGEMM_L4x16_BEGIN LDGEMM_L4x16_END: diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 2c7851207..5be517f7c 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x16 - mr T1, CO - add T2, T1, LDC - add T3, T2, LDC - add T4, T3, LDC + add T2, CO, LDC lxvd2x vs0, 0, CO lxvd2x vs1, o16, CO @@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs3, o48, CO lxvd2x vs4, o64, CO lxvd2x vs5, o80, CO + add T3, T2, LDC lxvd2x vs6, o96, CO lxvd2x vs7, o112, CO @@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs11, o48, T2 lxvd2x vs12, o64, T2 lxvd2x vs13, o80, T2 + add T4, T3, LDC lxvd2x vs14, o96, T2 lxvd2x vs15, o112, T2 @@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs31, o112, T3 xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r - lxvd2x vs32, 0, T4 + xvmaddadp vs1, vs33, alpha_r lxvd2x vs33, o16, T4 + xvmaddadp vs2, vs34, alpha_r lxvd2x vs34, o32, T4 + xvmaddadp vs3, vs35, alpha_r lxvd2x vs35, o48, T4 + xvmaddadp vs4, vs36, alpha_r lxvd2x vs36, o64, T4 + xvmaddadp vs5, vs37, alpha_r lxvd2x vs37, o80, T4 + xvmaddadp vs6, vs38, alpha_r lxvd2x vs38, o96, T4 + xvmaddadp vs7, vs39, alpha_r lxvd2x vs39, o112, T4 xvmaddadp vs8, vs40, alpha_r @@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r - stxvd2x vs4, o64, T1 - stxvd2x vs5, o80, T1 - stxvd2x vs6, o96, T1 - stxvd2x vs7, o112, T1 - xvmaddadp vs24, vs48, alpha_r xvmaddadp vs25, vs49, alpha_r xvmaddadp vs26, vs50, alpha_r xvmaddadp vs27, vs51, alpha_r - stxvd2x vs8, o0, T2 - stxvd2x vs9, o16, T2 - stxvd2x vs10, o32, T2 - stxvd2x vs11, o48, T2 - xvmaddadp vs28, vs52, alpha_r xvmaddadp vs29, vs53, alpha_r xvmaddadp vs30, vs54, alpha_r xvmaddadp vs31, vs55, alpha_r - stxvd2x vs12, o64, T2 - stxvd2x vs13, o80, T2 - stxvd2x vs14, o96, T2 - stxvd2x vs15, o112, T2 + stxvd2x vs0, 0, CO + stxvd2x vs1, o16, CO + stxvd2x vs2, o32, CO + stxvd2x vs3, o48, CO + + stxvd2x vs4, o64, CO + stxvd2x vs5, o80, CO + stxvd2x vs6, o96, CO + stxvd2x vs7, o112, CO xvmaddadp vs32, vs56, alpha_r xvmaddadp vs33, vs57, alpha_r xvmaddadp vs34, vs58, alpha_r xvmaddadp vs35, vs59, alpha_r - stxvd2x vs24, 0, T3 - stxvd2x vs25, o16, T3 - stxvd2x vs26, o32, T3 - stxvd2x vs27, o48, T3 - xvmaddadp vs36, vs60, alpha_r xvmaddadp vs37, vs61, alpha_r xvmaddadp vs38, vs62, alpha_r xvmaddadp vs39, vs63, alpha_r + addi CO, CO, 128 + + stxvd2x vs8, o0, T2 + stxvd2x vs9, o16, T2 + stxvd2x vs10, o32, T2 + stxvd2x vs11, o48, T2 + + stxvd2x vs12, o64, T2 + stxvd2x vs13, o80, T2 + stxvd2x vs14, o96, T2 + stxvd2x vs15, o112, T2 + + stxvd2x vs24, 0, T3 + stxvd2x vs25, o16, T3 stxvd2x vs28, o64, T3 stxvd2x vs29, o80, T3 + + stxvd2x vs26, o32, T3 + stxvd2x vs27, o48, T3 stxvd2x vs30, o96, T3 stxvd2x vs31, o112, T3 @@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T4 stxvd2x vs35, o48, T4 - addi CO, CO, 128 - stxvd2x vs36, o64, T4 stxvd2x vs37, o80, T4 stxvd2x vs38, o96, T4 diff --git a/param.h b/param.h index 9046c33d7..489127d2d 100644 --- a/param.h +++ b/param.h @@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 4096 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640