Merge pull request #876 from wernsaar/develop
optimized dgemm on power8 for 20 threads
commit 88011f625d
@@ -13,10 +13,10 @@ endif
 ifeq ($(CORE), POWER8)
 ifeq ($(USE_OPENMP), 1)
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 else
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
 endif
 endif
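Note on the Makefile change: the only functional edit is dropping `-DALLOC_SHM` from `COMMON_OPT` in both branches; every other flag is unchanged. In OpenBLAS, `ALLOC_SHM` enables the System V shared-memory allocator for the GEMM scratch buffers; without it the library falls back to plain anonymous mappings, which are not subject to the kernel's `SHMMAX` ceiling and so tolerate the 64 MiB buffer introduced below. A minimal sketch of the toggle, with simplified hypothetical names (the real allocator in `memory.c` is more involved):

```c
#include <stddef.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/shm.h>

/* Hypothetical stand-in for the allocator dispatch that ALLOC_SHM
 * selects; dropping -DALLOC_SHM compiles the mmap branch. */
static void *alloc_buffer(size_t size) {
#ifdef ALLOC_SHM
    int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);  /* SysV SHM */
    void *p = (id < 0) ? (void *)-1 : shmat(id, NULL, 0);
    return (p == (void *)-1) ? NULL : p;
#else
    void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);    /* anon pages */
    return (p == MAP_FAILED) ? NULL : p;
#endif
}

int main(void) { return alloc_buffer(64u << 20) ? 0 : 1; }
```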
@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE	( 16 << 20)
 #elif defined(POWER8)
-#define BUFFER_SIZE	( 32 << 20)
+#define BUFFER_SIZE	( 64 << 20)
 #else
 #define BUFFER_SIZE	( 16 << 20)
 #endif
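`BUFFER_SIZE` sizes the per-thread GEMM scratch area, and `( n << 20 )` is simply n MiB, so POWER8 goes from 32 MiB to 64 MiB; the larger blocking in `param.h` below (`DGEMM_DEFAULT_Q` 640 -> 720) has to fit in this buffer. Quick check of the shift arithmetic:

```c
#include <stdio.h>

#define MIB(n) ((n) << 20)   /* (n << 20) == n * 2^20 bytes == n MiB */

int main(void) {
    printf("old: %d bytes (%d MiB)\n", MIB(32), MIB(32) >> 20);
    printf("new: %d bytes (%d MiB)\n", MIB(64), MIB(64) >> 20);
    return 0;
}
```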
@@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 LDGEMM_L4_BEGIN:

-	mr	CO, C
+	li	T1, 128
+	li	T2, 256
 	mr	AO, A
-	slwi	T1, LDC, 2
-	add	C, C, T1
+	mr	CO, C
+	slwi	T3, LDC, 2
+	add	C, C, T3
+
+	dcbt	A, T1
+	dcbt	A, T2

 	srawi.	I, M, 4
 	ble	LDGEMM_L4x16_END
+
+	.align 4
+LDGEMM_L4x16_BEGIN_FIRST:
+
+	li	L, -128
+
+	mr	T1, CO
+	add	T2, T1, LDC
+	add	T3, T2, LDC
+	add	T4, T3, LDC
+
+	and	T1, T1, L
+	and	T2, T2, L
+	and	T3, T3, L
+	and	T4, T4, L
+
+	dcbt	T1, r0
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+
+	mr	BO, B
+	srawi.	L, K, 2
+
+	addi	T1, T1, 128
+	addi	T2, T2, 128
+	addi	T3, T3, 128
+	addi	T4, T4, 128
+
+	dcbt	T1, r0
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+
+	ble	LDGEMM_L4x16_SUB0_FIRST
+	cmpwi	cr0, L, 1
+	ble	LDGEMM_L4x16_SUB4_FIRST
+
+	.align 4
+LDGEMM_L4x16_LOOP_START_FIRST:
+
+	li	T2, 512
+	li	o40, 40
+	li	o56, 56
+
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	LOAD4x16_1
+	dcbt	AO, PRE
+	KERNEL4x16_I1
+	dcbt	AO, PRE
+	addic.	L, L, -2
+	KERNEL4x16_L2
+
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	KERNEL4x16_L2
+
+	ble	LDGEMM_L4x16_LOOP_END_FIRST
+	mtctr	L
+
+	.align 4
+
+LDGEMM_L4x16_LOOP_FIRST:
+
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	KERNEL4x16_L2
+
+	dcbt	AO, PRE
+	KERNEL4x16_L1
+	dcbt	AO, PRE
+	dcbt	BO, T2
+	KERNEL4x16_L2
+
+	bdnz	LDGEMM_L4x16_LOOP_FIRST
+
+	.align 4
+
+LDGEMM_L4x16_LOOP_END_FIRST:
+
+	KERNEL4x16_L1
+	KERNEL4x16_L2
+
+	KERNEL4x16_1
+	KERNEL4x16_E2
+
+	b	LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB4_FIRST:
+
+	KERNEL4x16_SUBI1
+	KERNEL4x16_SUB1
+	KERNEL4x16_SUB1
+	KERNEL4x16_SUB1
+
+	b	LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB0_FIRST:
+
+	andi.	L, K, 3
+
+	KERNEL4x16_SUBI1
+
+	addic.	L, L, -1
+	ble	LDGEMM_L4x16_SAVE_FIRST
+	b	LDGEMM_L4x16_SUB2_FIRST
+
+LDGEMM_L4x16_SUB1_FIRST:
+
+	andi.	L, K, 3
+	ble	LDGEMM_L4x16_SAVE_FIRST
+
+LDGEMM_L4x16_SUB2_FIRST:
+
+	KERNEL4x16_SUB1
+
+	addic.	L, L, -1
+	bgt	LDGEMM_L4x16_SUB2_FIRST
+
+	.align 4
+LDGEMM_L4x16_SAVE_FIRST:
+
+	SAVE4x16
+
+	addic.	I, I, -1
+	ble	LDGEMM_L4x16_END
+
+LDGEMM_L4x16_END_FIRST:
+
 	.align 4
 LDGEMM_L4x16_BEGIN:
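The new `LDGEMM_L4x16_*_FIRST` block peels the first 16x4 tile of the M loop into its own copy whose job is cache priming: `dcbt A,T1` / `dcbt A,T2` touch the next 128-byte lines of the packed A panel, and the `li L,-128` / `and Tn,Tn,L` sequence rounds each of the four C column pointers down to a cache-line boundary so `dcbt Tn,r0` (and, after `addi Tn,Tn,128`, the following line) is touched before the FMA loop starts. `dcbt` is the PowerPC data-cache-block-touch prefetch hint; `PRE` holds the prefetch distance applied to `AO`. A C-level sketch of the same priming pattern via GCC's builtin (hypothetical helper, read-touch like `dcbt`, assuming 128-byte lines):

```c
#include <stdint.h>

/* Touch the first two cache lines of each of four C columns before the
 * kernel loop, mirroring the and/dcbt/addi/dcbt sequence above. */
static inline void prime_c_columns(const double *c, long ldc_bytes) {
    for (int j = 0; j < 4; j++) {
        uintptr_t col  = (uintptr_t)c + (uintptr_t)j * (uintptr_t)ldc_bytes;
        uintptr_t line = col & ~(uintptr_t)127;        /* and Tn, Tn, -128 */
        __builtin_prefetch((const void *)line, 0, 3);         /* dcbt Tn, r0 */
        __builtin_prefetch((const void *)(line + 128), 0, 3); /* addi + dcbt */
    }
}
```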
@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
 	dcbt	T3, r0
 	dcbt	T4, r0

-	ble	LDGEMM_L4x16_SUB0
+	ble-	LDGEMM_L4x16_SUB0
 	cmpwi	cr0, L, 1
-	ble	LDGEMM_L4x16_SUB4
+	ble-	LDGEMM_L4x16_SUB4

 	.align 4
 LDGEMM_L4x16_LOOP_START:
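The `-` suffix on `ble-` is a static prediction hint encoded in the branch's BO field: it marks the branch as unlikely to be taken, keeping the fall-through main-loop path the predicted one. The portable analogue, assuming GCC/Clang:

```c
/* Static "unlikely" hint, the C-level counterpart of the ble- suffix. */
#define unlikely(x) __builtin_expect(!!(x), 0)

int run_kernel_or_tail(int L) {
    if (unlikely(L <= 0))   /* mirrors "ble- LDGEMM_L4x16_SUB0" */
        return 0;           /* rare remainder path */
    return 1;               /* hot unrolled-loop path */
}
```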
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
 	addic.	L, L, -2
 	KERNEL4x16_L2

-	ble	LDGEMM_L4x16_LOOP_END
+	ble-	LDGEMM_L4x16_LOOP_END
+	mtctr	L

 	.align 4
@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
 	dcbt	AO, PRE
 	KERNEL4x16_L1
 	dcbt	AO, PRE
-	addic.	L, L, -1
+//	addic.	L, L, -1
 	KERNEL4x16_L2

-	bgt	LDGEMM_L4x16_LOOP
+	bdnz+	LDGEMM_L4x16_LOOP

 	.align 4
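This hunk moves the inner-loop trip count into the count register: `mtctr L` (added at the end of `LOOP_START` above) loads CTR once, the `addic. L,L,-1` inside the loop body is commented out, and `bgt` becomes `bdnz+`, which decrements CTR and branches while it is nonzero in a single instruction, so neither a GPR update nor a condition-register compare sits in the FMA-heavy loop. A minimal sketch of the pairing in GCC extended asm (assumes a PowerPC target and n > 0):

```c
/* Counted loop through the CTR register: mtctr once, bdnz per iteration. */
static inline void ctr_loop(unsigned long n) {
    __asm__ volatile(
        "mtctr  %0      \n\t"   /* trip count -> CTR                  */
        "1:\n\t"
        "nop            \n\t"   /* stand-in for KERNEL4x16_L1/L2      */
        "bdnz+  1b      \n\t"   /* dec CTR, branch back while nonzero */
        : /* no outputs */ : "r"(n) : "ctr");
}
```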
@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
 	SAVE4x16

 	addic.	I, I, -1
-	bgt	LDGEMM_L4x16_BEGIN
+	bgt+	LDGEMM_L4x16_BEGIN

 LDGEMM_L4x16_END:
@@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 .macro SAVE4x16

-	mr	T1, CO
-	add	T2, T1, LDC
-	add	T3, T2, LDC
-	add	T4, T3, LDC
+	add	T2, CO, LDC

 	lxvd2x	vs0, 0, CO
 	lxvd2x	vs1, o16, CO
@@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs3, o48, CO
 	lxvd2x	vs4, o64, CO
 	lxvd2x	vs5, o80, CO
+	add	T3, T2, LDC
 	lxvd2x	vs6, o96, CO
 	lxvd2x	vs7, o112, CO
@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs11, o48, T2
 	lxvd2x	vs12, o64, T2
 	lxvd2x	vs13, o80, T2
+	add	T4, T3, LDC
 	lxvd2x	vs14, o96, T2
 	lxvd2x	vs15, o112, T2
@@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x	vs31, o112, T3

 	xvmaddadp	vs0, vs32, alpha_r
-	xvmaddadp	vs1, vs33, alpha_r
-	xvmaddadp	vs2, vs34, alpha_r
-	xvmaddadp	vs3, vs35, alpha_r
-	xvmaddadp	vs4, vs36, alpha_r
-	xvmaddadp	vs5, vs37, alpha_r
-	xvmaddadp	vs6, vs38, alpha_r
-	xvmaddadp	vs7, vs39, alpha_r
-
 	lxvd2x	vs32, 0, T4
+	xvmaddadp	vs1, vs33, alpha_r
 	lxvd2x	vs33, o16, T4
+	xvmaddadp	vs2, vs34, alpha_r
 	lxvd2x	vs34, o32, T4
+	xvmaddadp	vs3, vs35, alpha_r
 	lxvd2x	vs35, o48, T4
+	xvmaddadp	vs4, vs36, alpha_r
 	lxvd2x	vs36, o64, T4
+	xvmaddadp	vs5, vs37, alpha_r
 	lxvd2x	vs37, o80, T4
+	xvmaddadp	vs6, vs38, alpha_r
 	lxvd2x	vs38, o96, T4
+	xvmaddadp	vs7, vs39, alpha_r
 	lxvd2x	vs39, o112, T4

 	xvmaddadp	vs8, vs40, alpha_r
@@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs10, vs42, alpha_r
 	xvmaddadp	vs11, vs43, alpha_r

-	stxvd2x	vs0, 0, T1
-	stxvd2x	vs1, o16, T1
-	stxvd2x	vs2, o32, T1
-	stxvd2x	vs3, o48, T1
-
 	xvmaddadp	vs12, vs44, alpha_r
 	xvmaddadp	vs13, vs45, alpha_r
 	xvmaddadp	vs14, vs46, alpha_r
 	xvmaddadp	vs15, vs47, alpha_r

-	stxvd2x	vs4, o64, T1
-	stxvd2x	vs5, o80, T1
-	stxvd2x	vs6, o96, T1
-	stxvd2x	vs7, o112, T1
-
 	xvmaddadp	vs24, vs48, alpha_r
 	xvmaddadp	vs25, vs49, alpha_r
 	xvmaddadp	vs26, vs50, alpha_r
 	xvmaddadp	vs27, vs51, alpha_r

-	stxvd2x	vs8, o0, T2
-	stxvd2x	vs9, o16, T2
-	stxvd2x	vs10, o32, T2
-	stxvd2x	vs11, o48, T2
-
 	xvmaddadp	vs28, vs52, alpha_r
 	xvmaddadp	vs29, vs53, alpha_r
 	xvmaddadp	vs30, vs54, alpha_r
 	xvmaddadp	vs31, vs55, alpha_r

-	stxvd2x	vs12, o64, T2
-	stxvd2x	vs13, o80, T2
-	stxvd2x	vs14, o96, T2
-	stxvd2x	vs15, o112, T2
+	stxvd2x	vs0, 0, CO
+	stxvd2x	vs1, o16, CO
+	stxvd2x	vs2, o32, CO
+	stxvd2x	vs3, o48, CO

+	stxvd2x	vs4, o64, CO
+	stxvd2x	vs5, o80, CO
+	stxvd2x	vs6, o96, CO
+	stxvd2x	vs7, o112, CO

 	xvmaddadp	vs32, vs56, alpha_r
 	xvmaddadp	vs33, vs57, alpha_r
 	xvmaddadp	vs34, vs58, alpha_r
 	xvmaddadp	vs35, vs59, alpha_r

-	stxvd2x	vs24, 0, T3
-	stxvd2x	vs25, o16, T3
-	stxvd2x	vs26, o32, T3
-	stxvd2x	vs27, o48, T3
-
 	xvmaddadp	vs36, vs60, alpha_r
 	xvmaddadp	vs37, vs61, alpha_r
 	xvmaddadp	vs38, vs62, alpha_r
 	xvmaddadp	vs39, vs63, alpha_r

+	addi	CO, CO, 128
+
+	stxvd2x	vs8, o0, T2
+	stxvd2x	vs9, o16, T2
+	stxvd2x	vs10, o32, T2
+	stxvd2x	vs11, o48, T2
+
+	stxvd2x	vs12, o64, T2
+	stxvd2x	vs13, o80, T2
+	stxvd2x	vs14, o96, T2
+	stxvd2x	vs15, o112, T2
+
+	stxvd2x	vs24, 0, T3
+	stxvd2x	vs25, o16, T3
 	stxvd2x	vs28, o64, T3
 	stxvd2x	vs29, o80, T3
+
+	stxvd2x	vs26, o32, T3
+	stxvd2x	vs27, o48, T3
 	stxvd2x	vs30, o96, T3
 	stxvd2x	vs31, o112, T3
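The `SAVE4x16` changes are pure scheduling: the `T2`/`T3`/`T4` row-address adds move in between the `lxvd2x` loads, the leading `xvmaddadp` group is interleaved with the loads of the last row, and the `stxvd2x` stores now trail the FMAs that produce their operands by several groups instead of issuing back-to-back behind them; the first row also stores directly through `CO` (advanced by `addi CO,CO,128`) instead of a precomputed `T1`. Roughly, in C terms (a sketch of the intent, not the macro):

```c
/* One 16-column row of C per step: row r's stores drain while row r+1's
 * loads and fused multiply-adds are already issuing. */
static void save4x16(double *restrict c, long ldc,
                     const double *restrict acc, double alpha) {
    for (int r = 0; r < 4; r++) {              /* rows at CO, T2, T3, T4 */
        double *cr = c + (long)r * ldc;
        for (int i = 0; i < 16; i++)
            cr[i] += alpha * acc[16 * r + i];  /* lxvd2x + xvmaddadp + stxvd2x */
    }
}
```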
|
@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs34, o32, T4
|
stxvd2x vs34, o32, T4
|
||||||
stxvd2x vs35, o48, T4
|
stxvd2x vs35, o48, T4
|
||||||
|
|
||||||
addi CO, CO, 128
|
|
||||||
|
|
||||||
stxvd2x vs36, o64, T4
|
stxvd2x vs36, o64, T4
|
||||||
stxvd2x vs37, o80, T4
|
stxvd2x vs37, o80, T4
|
||||||
stxvd2x vs38, o96, T4
|
stxvd2x vs38, o96, T4
|
||||||
|
|
6
param.h
6
param.h
|
@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DNUMOPT		8

 #define GEMM_DEFAULT_OFFSET_A	0
-#define GEMM_DEFAULT_OFFSET_B	4096
-#define GEMM_DEFAULT_ALIGN	0x03fffUL
+#define GEMM_DEFAULT_OFFSET_B	65536
+#define GEMM_DEFAULT_ALIGN	0x0ffffUL

 #define SGEMM_DEFAULT_UNROLL_M	16
 #define SGEMM_DEFAULT_UNROLL_N	8
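`GEMM_DEFAULT_ALIGN` is the round-up mask for carving thread partitions out of the GEMM buffer: `0x0ffffUL` covers the low 16 bits, so partitions land on 64 KiB boundaries instead of 16 KiB ones, and `GEMM_DEFAULT_OFFSET_B` now staggers the packed B panel a matching 64 KiB from A, spreading the two streams across cache and TLB sets. The mask arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uintptr_t mask = 0x0ffffUL;        /* low 16 bits: 64 KiB - 1 */
    uintptr_t p  = 0x12345678UL;
    uintptr_t up = (p + mask) & ~mask;       /* round up to 64 KiB      */
    printf("%#lx -> %#lx\n", (unsigned long)p, (unsigned long)up);
    /* prints: 0x12345678 -> 0x12350000 */
    return 0;
}
```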
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_P	320

 #define SGEMM_DEFAULT_Q	640
-#define DGEMM_DEFAULT_Q	640
+#define DGEMM_DEFAULT_Q	720
 #define CGEMM_DEFAULT_Q	640
 #define ZGEMM_DEFAULT_Q	640
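`DGEMM_DEFAULT_Q` is the K-dimension block size, so a packed panel grows from P x 640 to P x 720 doubles per pass; together with the 64 MiB `BUFFER_SIZE` this is the blocking retune behind the 20-thread result in the title. Back-of-envelope footprint, with a placeholder P (the diff does not show `DGEMM_DEFAULT_P`):

```c
#include <stdio.h>

int main(void) {
    long P = 256;                            /* hypothetical M block */
    for (long Q = 640; Q <= 720; Q += 80)    /* old vs new K block   */
        printf("Q=%ld: packed A panel = %ld KiB\n",
               Q, P * Q * (long)sizeof(double) / 1024);
    return 0;
}
```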