optimized dgemm for 20 threads

This commit is contained in:
Werner Saar 2016-05-16 14:14:25 +02:00
parent 0d1c695508
commit 8310d4d3f7
5 changed files with 191 additions and 53 deletions

View File

@ -13,10 +13,10 @@ endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
endif
endif

View File

@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 32 << 20)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDGEMM_L4_BEGIN:
mr CO, C
li T1, 128
li T2, 256
mr AO, A
slwi T1, LDC , 2
add C, C, T1
mr CO, C
slwi T3, LDC , 2
add C, C, T3
dcbt A, T1
dcbt A, T2
srawi. I, M, 4
ble LDGEMM_L4x16_END
.align 4
// First pass over a 4x16 tile: this section is entered once, handles one
// tile, and then either leaves via LDGEMM_L4x16_END or falls through to the
// label that follows LDGEMM_L4x16_END_FIRST. The LOAD4x16_1, KERNEL4x16_* and
// SAVE4x16 macros are defined outside this section; their register usage is
// not visible here.
LDGEMM_L4x16_BEGIN_FIRST:
// L = -128 is used below as an AND mask that clears the low 7 bits,
// rounding each address down to a 128-byte boundary.
li L, -128
// T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC.
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
and T1, T1, L
and T2, T2, L
and T3, T3, L
and T4, T4, L
// Cache-touch hint for each of the four rounded addresses.
// NOTE(review): r0 is the index operand; its value is set outside this
// section - presumably zero, confirm.
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
mr BO, B
// L = K >> 2 (record form: sets cr0). The addi/dcbt instructions that follow
// do not write cr0, so the "ble" further down still tests this result.
srawi. L, K, 2
// Advance each address by 128 bytes and touch the next block as well.
addi T1, T1, 128
addi T2, T2, 128
addi T3, T3, 128
addi T4, T4, 128
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
// K >> 2 <= 0: skip the unrolled path and handle only the K & 3 remainder.
ble LDGEMM_L4x16_SUB0_FIRST
// K >> 2 == 1 (L is known to be positive here): take the short SUB4 path.
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4_FIRST
.align 4
// Unrolled path, taken when K >> 2 >= 2.
LDGEMM_L4x16_LOOP_START_FIRST:
// T2 is reused as a 512-byte index for the dcbt hints on BO.
// o40/o56 are loaded with 40 and 56; presumably offset registers consumed by
// the kernel macros - TODO confirm against the macro definitions.
li T2, 512
li o40, 40
li o56, 56
dcbt AO, PRE
dcbt BO, T2
LOAD4x16_1
dcbt AO, PRE
KERNEL4x16_I1
dcbt AO, PRE
// L -= 2 (record form: sets cr0). The result is tested by the "ble" after the
// next three kernel macros.
// NOTE(review): this relies on those macros leaving cr0 untouched - confirm.
addic. L, L, -2
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
// No middle iterations left: go straight to the closing sequence.
ble LDGEMM_L4x16_LOOP_END_FIRST
// Otherwise run the middle loop L times using the count register.
mtctr L
.align 4
// Middle loop: four kernel macro invocations per iteration, interleaved with
// cache-touch hints on AO (and one on BO).
LDGEMM_L4x16_LOOP_FIRST:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
// Decrement CTR and loop while it is non-zero.
bdnz LDGEMM_L4x16_LOOP_FIRST
.align 4
// Closing sequence of the unrolled path, then handle the K & 3 remainder.
LDGEMM_L4x16_LOOP_END_FIRST:
KERNEL4x16_L1
KERNEL4x16_L2
KERNEL4x16_1
KERNEL4x16_E2
b LDGEMM_L4x16_SUB1_FIRST
// K >> 2 == 1: one SUBI1 followed by three SUB1 invocations, then the
// K & 3 remainder.
LDGEMM_L4x16_SUB4_FIRST:
KERNEL4x16_SUBI1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
b LDGEMM_L4x16_SUB1_FIRST
// K >> 2 <= 0: L = K & 3. KERNEL4x16_SUBI1 is invoked unconditionally, then
// L - 1 further SUB1 invocations run in the SUB2 loop if any remain.
LDGEMM_L4x16_SUB0_FIRST:
andi. L, K, 3
KERNEL4x16_SUBI1
addic. L, L, -1
ble LDGEMM_L4x16_SAVE_FIRST
b LDGEMM_L4x16_SUB2_FIRST
// Remainder after the unrolled or SUB4 path: L = K & 3; nothing to do if zero.
LDGEMM_L4x16_SUB1_FIRST:
andi. L, K, 3
ble LDGEMM_L4x16_SAVE_FIRST
// One KERNEL4x16_SUB1 per remaining count in L.
LDGEMM_L4x16_SUB2_FIRST:
KERNEL4x16_SUB1
addic. L, L, -1
bgt LDGEMM_L4x16_SUB2_FIRST
.align 4
// Invoke SAVE4x16, then decrement the tile counter I. If no tiles remain,
// leave via LDGEMM_L4x16_END; otherwise fall through to the code that follows
// this section.
LDGEMM_L4x16_SAVE_FIRST:
SAVE4x16
addic. I, I, -1
ble LDGEMM_L4x16_END
// Not referenced by any branch inside this section.
LDGEMM_L4x16_END_FIRST:
.align 4
LDGEMM_L4x16_BEGIN:
@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
dcbt T3, r0
dcbt T4, r0
ble LDGEMM_L4x16_SUB0
ble- LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4
ble- LDGEMM_L4x16_SUB4
.align 4
LDGEMM_L4x16_LOOP_START:
@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
addic. L, L, -2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END
ble- LDGEMM_L4x16_LOOP_END
mtctr L
.align 4
@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
addic. L, L, -1
// addic. L, L, -1
KERNEL4x16_L2
bgt LDGEMM_L4x16_LOOP
bdnz+ LDGEMM_L4x16_LOOP
.align 4
@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
SAVE4x16
addic. I, I, -1
bgt LDGEMM_L4x16_BEGIN
bgt+ LDGEMM_L4x16_BEGIN
LDGEMM_L4x16_END:

View File

@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
add T2, CO, LDC
lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
add T3, T2, LDC
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
add T4, T3, LDC
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs31, o112, T3
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs32, 0, T4
xvmaddadp vs1, vs33, alpha_r
lxvd2x vs33, o16, T4
xvmaddadp vs2, vs34, alpha_r
lxvd2x vs34, o32, T4
xvmaddadp vs3, vs35, alpha_r
lxvd2x vs35, o48, T4
xvmaddadp vs4, vs36, alpha_r
lxvd2x vs36, o64, T4
xvmaddadp vs5, vs37, alpha_r
lxvd2x vs37, o80, T4
xvmaddadp vs6, vs38, alpha_r
lxvd2x vs38, o96, T4
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs39, o112, T4
xvmaddadp vs8, vs40, alpha_r
@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
stxvd2x vs4, o64, T1
stxvd2x vs5, o80, T1
stxvd2x vs6, o96, T1
stxvd2x vs7, o112, T1
xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs0, 0, CO
stxvd2x vs1, o16, CO
stxvd2x vs2, o32, CO
stxvd2x vs3, o48, CO
stxvd2x vs4, o64, CO
stxvd2x vs5, o80, CO
stxvd2x vs6, o96, CO
stxvd2x vs7, o112, CO
xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r
addi CO, CO, 128
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
addi CO, CO, 128
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4

View File

@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640