Merge pull request #876 from wernsaar/develop

optimized dgemm on power8 for 20 threads
Werner Saar 2016-05-16 14:52:40 +02:00
commit 88011f625d
5 changed files with 191 additions and 53 deletions

View File

@@ -13,10 +13,10 @@ endif
 ifeq ($(CORE), POWER8)
 ifeq ($(USE_OPENMP), 1)
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 else
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
 endif
 endif
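
The dropped -DALLOC_SHM is the flag that steered OpenBLAS's memory.c toward SysV shared-memory segments; without it the allocator should fall back to its anonymous-mmap path. A minimal sketch of the two paths this flag toggles (illustrative C only, not the actual memory.c code):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>

#define BUFFER_SIZE (64 << 20)

static void *alloc_buffer(void)
{
#ifdef ALLOC_SHM
    /* SysV shared-memory segment, as the removed -DALLOC_SHM selected */
    int id = shmget(IPC_PRIVATE, BUFFER_SIZE, IPC_CREAT | 0600);
    if (id < 0) return NULL;
    void *p = shmat(id, NULL, 0);
    shmctl(id, IPC_RMID, NULL);   /* segment is freed once detached */
    return p == (void *)-1 ? NULL : p;
#else
    /* anonymous mmap, the default path once the flag is dropped */
    void *p = mmap(NULL, BUFFER_SIZE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return p == MAP_FAILED ? NULL : p;
#endif
}

int main(void)
{
    printf("buffer at %p\n", alloc_buffer());
    return 0;
}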

View File

@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE ( 16 << 20)
 #elif defined(POWER8)
-#define BUFFER_SIZE ( 32 << 20)
+#define BUFFER_SIZE ( 64 << 20)
 #else
 #define BUFFER_SIZE ( 16 << 20)
 #endif
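
For scale, the macros above are plain shift arithmetic: the POWER8 scratch buffer used for packed GEMM panels doubles from 32 MiB to 64 MiB, matching the deeper DGEMM blocking further down. A quick check in C:

#include <stdio.h>

int main(void)
{
    /* the BUFFER_SIZE expressions from the diff above */
    printf("old: %d MiB\n", (32 << 20) >> 20);   /* 32 */
    printf("new: %d MiB\n", (64 << 20) >> 20);   /* 64 */
    return 0;
}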

View File

@@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 LDGEMM_L4_BEGIN:
 
-	mr	CO, C
+	li	T1, 128
+	li	T2, 256
 	mr	AO, A
-	slwi	T1, LDC, 2
-	add	C, C, T1
+	mr	CO, C
+	slwi	T3, LDC, 2
+	add	C, C, T3
+	dcbt	A, T1
+	dcbt	A, T2
 	srawi.	I, M, 4
 	ble	LDGEMM_L4x16_END
+
+	.align 4
+
+LDGEMM_L4x16_BEGIN_FIRST:
+
+	li	L, -128
+	mr	T1, CO
+	add	T2, T1, LDC
+	add	T3, T2, LDC
+	add	T4, T3, LDC
+	and	T1, T1, L
+	and	T2, T2, L
+	and	T3, T3, L
+	and	T4, T4, L
+	dcbt	T1, r0
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+	mr	BO, B
+	srawi.	L, K, 2
+	addi	T1, T1, 128
+	addi	T2, T2, 128
+	addi	T3, T3, 128
+	addi	T4, T4, 128
+	dcbt	T1, r0
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+	ble	LDGEMM_L4x16_SUB0_FIRST
+	cmpwi	cr0, L, 1
+	ble	LDGEMM_L4x16_SUB4_FIRST
+
+	.align 4
+
+LDGEMM_L4x16_LOOP_START_FIRST:
+
+	li	T2, 512
+	li	o40, 40
+	li	o56, 56
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	LOAD4x16_1
+	dcbt	AO, PRE
+	KERNEL4x16_I1
+	dcbt	AO, PRE
+	addic.	L, L, -2
+	KERNEL4x16_L2
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	KERNEL4x16_L2
+	ble	LDGEMM_L4x16_LOOP_END_FIRST
+	mtctr	L
+
+	.align 4
+
+LDGEMM_L4x16_LOOP_FIRST:
+
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	KERNEL4x16_L2
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	KERNEL4x16_L2
+	bdnz	LDGEMM_L4x16_LOOP_FIRST
+
+	.align 4
+
+LDGEMM_L4x16_LOOP_END_FIRST:
+
+	KERNEL4x16_L1
+	KERNEL4x16_L2
+	KERNEL4x16_1
+	KERNEL4x16_E2
+
+	b	LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB4_FIRST:
+
+	KERNEL4x16_SUBI1
+	KERNEL4x16_SUB1
+	KERNEL4x16_SUB1
+	KERNEL4x16_SUB1
+
+	b	LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB0_FIRST:
+
+	andi.	L, K, 3
+	KERNEL4x16_SUBI1
+	addic.	L, L, -1
+	ble	LDGEMM_L4x16_SAVE_FIRST
+	b	LDGEMM_L4x16_SUB2_FIRST
+
+LDGEMM_L4x16_SUB1_FIRST:
+
+	andi.	L, K, 3
+	ble	LDGEMM_L4x16_SAVE_FIRST
+
+LDGEMM_L4x16_SUB2_FIRST:
+
+	KERNEL4x16_SUB1
+	addic.	L, L, -1
+	bgt	LDGEMM_L4x16_SUB2_FIRST
+
+	.align 4
+
+LDGEMM_L4x16_SAVE_FIRST:
+
+	SAVE4x16
+	addic.	I, I, -1
+	ble	LDGEMM_L4x16_END
+
+LDGEMM_L4x16_END_FIRST:
 
 	.align 4
 
 LDGEMM_L4x16_BEGIN:
@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
 	dcbt	T3, r0
 	dcbt	T4, r0
-	ble	LDGEMM_L4x16_SUB0
+	ble-	LDGEMM_L4x16_SUB0
 	cmpwi	cr0, L, 1
-	ble	LDGEMM_L4x16_SUB4
+	ble-	LDGEMM_L4x16_SUB4
 
 	.align 4
 
 LDGEMM_L4x16_LOOP_START:
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
 	addic.	L, L, -2
 	KERNEL4x16_L2
-	ble	LDGEMM_L4x16_LOOP_END
+	ble-	LDGEMM_L4x16_LOOP_END
+	mtctr	L
 
 	.align 4
@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
 	dcbt	AO, PRE
 	KERNEL4x16_L1
 	dcbt	AO, PRE
-	addic.	L, L, -1
+	// addic.	L, L, -1
 	KERNEL4x16_L2
-	bgt	LDGEMM_L4x16_LOOP
+	bdnz+	LDGEMM_L4x16_LOOP
 
 	.align 4
@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
 	SAVE4x16
 	addic.	I, I, -1
-	bgt	LDGEMM_L4x16_BEGIN
+	bgt+	LDGEMM_L4x16_BEGIN
 
 LDGEMM_L4x16_END:
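
Two things change in this file. First, the new LDGEMM_L4x16_*_FIRST copy peels the first 16x4 tile of the M loop so the dcbt touches on the four output rows of C (aligned down to POWER8's 128-byte cache lines by "and Tn, Tn, -128", plus the following line) are issued before any floating-point work starts. Second, the hot loop's counter moves from a GPR ("addic. L, L, -1" / "bgt") into the count register ("mtctr L" / "bdnz+"), removing a condition-register write from the loop body, and the static branch hints ("ble-", "bgt+") mark the expected direction. A rough C-level sketch of the peeling idea, with hypothetical names (tile_16x4, m_tiles), not the kernel itself:

#include <stddef.h>
#include <stdint.h>

/* hypothetical stand-in for the 16x4 DGEMM micro-kernel */
static void tile_16x4(double *c, size_t ldc) { (void)c; (void)ldc; }

void l4x16_sketch(double *c, size_t ldc, size_t m_tiles)
{
    /* peeled first tile: touch the four C rows it will update, at
     * 128-byte-aligned addresses, like the dcbt T1..T4 pairs above */
    for (int j = 0; j < 4; j++) {
        uintptr_t row = (uintptr_t)(c + j * ldc) & ~(uintptr_t)127;
        __builtin_prefetch((const void *)row, 0, 3);          /* dcbt Tn, r0 */
        __builtin_prefetch((const void *)(row + 128), 0, 3);  /* next line   */
    }
    tile_16x4(c, ldc);

    for (size_t i = 1; i < m_tiles; i++)   /* steady-state tiles (CTR loop) */
        tile_16x4(c + i * 16, ldc);        /* 16 doubles = 128 B per row    */
}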

View File

@@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE4x16
 
-	mr	T1, CO
-	add	T2, T1, LDC
-	add	T3, T2, LDC
-	add	T4, T3, LDC
+	add	T2, CO, LDC
 
 	lxvd2x	vs0, 0, CO
 	lxvd2x	vs1, o16, CO
@@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs3, o48, CO
 	lxvd2x	vs4, o64, CO
 	lxvd2x	vs5, o80, CO
+	add	T3, T2, LDC
 	lxvd2x	vs6, o96, CO
 	lxvd2x	vs7, o112, CO
@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs11, o48, T2
 	lxvd2x	vs12, o64, T2
 	lxvd2x	vs13, o80, T2
+	add	T4, T3, LDC
 	lxvd2x	vs14, o96, T2
 	lxvd2x	vs15, o112, T2
@@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs31, o112, T3
 
 	xvmaddadp	vs0, vs32, alpha_r
-	xvmaddadp	vs1, vs33, alpha_r
-	xvmaddadp	vs2, vs34, alpha_r
-	xvmaddadp	vs3, vs35, alpha_r
-	xvmaddadp	vs4, vs36, alpha_r
-	xvmaddadp	vs5, vs37, alpha_r
-	xvmaddadp	vs6, vs38, alpha_r
-	xvmaddadp	vs7, vs39, alpha_r
-
 	lxvd2x	vs32, 0, T4
+	xvmaddadp	vs1, vs33, alpha_r
 	lxvd2x	vs33, o16, T4
+	xvmaddadp	vs2, vs34, alpha_r
 	lxvd2x	vs34, o32, T4
+	xvmaddadp	vs3, vs35, alpha_r
 	lxvd2x	vs35, o48, T4
+	xvmaddadp	vs4, vs36, alpha_r
 	lxvd2x	vs36, o64, T4
+	xvmaddadp	vs5, vs37, alpha_r
 	lxvd2x	vs37, o80, T4
+	xvmaddadp	vs6, vs38, alpha_r
 	lxvd2x	vs38, o96, T4
+	xvmaddadp	vs7, vs39, alpha_r
 	lxvd2x	vs39, o112, T4
 
 	xvmaddadp	vs8, vs40, alpha_r
@@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs10, vs42, alpha_r
 	xvmaddadp	vs11, vs43, alpha_r
 
-	stxvd2x	vs0, 0, T1
-	stxvd2x	vs1, o16, T1
-	stxvd2x	vs2, o32, T1
-	stxvd2x	vs3, o48, T1
-
 	xvmaddadp	vs12, vs44, alpha_r
 	xvmaddadp	vs13, vs45, alpha_r
 	xvmaddadp	vs14, vs46, alpha_r
 	xvmaddadp	vs15, vs47, alpha_r
 
-	stxvd2x	vs4, o64, T1
-	stxvd2x	vs5, o80, T1
-	stxvd2x	vs6, o96, T1
-	stxvd2x	vs7, o112, T1
-
 	xvmaddadp	vs24, vs48, alpha_r
 	xvmaddadp	vs25, vs49, alpha_r
 	xvmaddadp	vs26, vs50, alpha_r
 	xvmaddadp	vs27, vs51, alpha_r
 
-	stxvd2x	vs8, o0, T2
-	stxvd2x	vs9, o16, T2
-	stxvd2x	vs10, o32, T2
-	stxvd2x	vs11, o48, T2
-
 	xvmaddadp	vs28, vs52, alpha_r
 	xvmaddadp	vs29, vs53, alpha_r
 	xvmaddadp	vs30, vs54, alpha_r
 	xvmaddadp	vs31, vs55, alpha_r
 
-	stxvd2x	vs12, o64, T2
-	stxvd2x	vs13, o80, T2
-	stxvd2x	vs14, o96, T2
-	stxvd2x	vs15, o112, T2
+	stxvd2x	vs0, 0, CO
+	stxvd2x	vs1, o16, CO
+	stxvd2x	vs2, o32, CO
+	stxvd2x	vs3, o48, CO
+	stxvd2x	vs4, o64, CO
+	stxvd2x	vs5, o80, CO
+	stxvd2x	vs6, o96, CO
+	stxvd2x	vs7, o112, CO
 
 	xvmaddadp	vs32, vs56, alpha_r
 	xvmaddadp	vs33, vs57, alpha_r
 	xvmaddadp	vs34, vs58, alpha_r
 	xvmaddadp	vs35, vs59, alpha_r
 
-	stxvd2x	vs24, 0, T3
-	stxvd2x	vs25, o16, T3
-	stxvd2x	vs26, o32, T3
-	stxvd2x	vs27, o48, T3
-
 	xvmaddadp	vs36, vs60, alpha_r
 	xvmaddadp	vs37, vs61, alpha_r
 	xvmaddadp	vs38, vs62, alpha_r
 	xvmaddadp	vs39, vs63, alpha_r
 
+	addi	CO, CO, 128
+
+	stxvd2x	vs8, o0, T2
+	stxvd2x	vs9, o16, T2
+	stxvd2x	vs10, o32, T2
+	stxvd2x	vs11, o48, T2
+	stxvd2x	vs12, o64, T2
+	stxvd2x	vs13, o80, T2
+	stxvd2x	vs14, o96, T2
+	stxvd2x	vs15, o112, T2
+
+	stxvd2x	vs24, 0, T3
+	stxvd2x	vs25, o16, T3
 	stxvd2x	vs28, o64, T3
 	stxvd2x	vs29, o80, T3
+	stxvd2x	vs26, o32, T3
+	stxvd2x	vs27, o48, T3
 	stxvd2x	vs30, o96, T3
 	stxvd2x	vs31, o112, T3
@@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x	vs34, o32, T4
 	stxvd2x	vs35, o48, T4
 
-	addi	CO, CO, 128
-
 	stxvd2x	vs36, o64, T4
 	stxvd2x	vs37, o80, T4
 	stxvd2x	vs38, o96, T4
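
The reworked SAVE4x16 computes the same update but reschedules it: CO is addressed directly instead of being copied into T1, the T2/T3/T4 row pointers are formed lazily between loads, "addi CO, CO, 128" is hoisted above the trailing stores, and the lxvd2x loads of the next C row are interleaved with the xvmaddadp multiply-adds of the previous rows so load latency overlaps the FP pipes. In scalar C the macro's semantics reduce to the following sketch (hypothetical names; the scheduling above, not the math, is the point of the diff):

#include <stddef.h>

/* what SAVE4x16 computes for one 16x4 tile: C := C + alpha * acc */
static void save4x16_sketch(double *co, size_t ldc,
                            const double acc[4][16], double alpha)
{
    for (int j = 0; j < 4; j++)        /* rows CO, T2, T3, T4      */
        for (int i = 0; i < 16; i++)   /* 16 doubles = 128 B / row */
            co[j * ldc + i] += alpha * acc[j][i];   /* xvmaddadp   */
}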

View File

@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DNUMOPT	8
 
 #define GEMM_DEFAULT_OFFSET_A	0
-#define GEMM_DEFAULT_OFFSET_B	4096
-#define GEMM_DEFAULT_ALIGN	0x03fffUL
+#define GEMM_DEFAULT_OFFSET_B	65536
+#define GEMM_DEFAULT_ALIGN	0x0ffffUL
 
 #define SGEMM_DEFAULT_UNROLL_M	16
 #define SGEMM_DEFAULT_UNROLL_N	8
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_P	320
 
 #define SGEMM_DEFAULT_Q	640
-#define DGEMM_DEFAULT_Q	640
+#define DGEMM_DEFAULT_Q	720
 #define CGEMM_DEFAULT_Q	640
 #define ZGEMM_DEFAULT_Q	640
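
In OpenBLAS's param.h the *_DEFAULT_Q values set the K-dimension blocking depth, so DGEMM now streams 720-deep panels; the larger GEMM_DEFAULT_OFFSET_B together with the wider GEMM_DEFAULT_ALIGN mask (64 KiB - 1) starts the packed B region on its own 64 KiB boundary, presumably to spread A and B across cache/TLB sets. Back-of-envelope numbers in C, using only values visible in this diff (the DGEMM unroll of 16 is taken from the 16x4 kernel above):

#include <stdio.h>

int main(void)
{
    int unroll_m = 16;             /* M unroll of the 16x4 DGEMM kernel */
    int q_old = 640, q_new = 720;  /* DGEMM_DEFAULT_Q: K-blocking depth */

    /* one UNROLL_M-wide stripe of packed A the kernel streams per pass */
    printf("A stripe: %d -> %d KiB\n",
           unroll_m * q_old * 8 / 1024,    /* 80 KiB */
           unroll_m * q_new * 8 / 1024);   /* 90 KiB */

    /* packed B now starts 64 KiB into the buffer, 64 KiB-aligned */
    printf("OFFSET_B = %d bytes, ALIGN mask = 0x%lx\n", 65536, 0x0ffffUL);
    return 0;
}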