optimized dgemm for 20 threads
This commit is contained in:
parent
0d1c695508
commit
8310d4d3f7
|
@ -13,10 +13,10 @@ endif
|
|||
|
||||
ifeq ($(CORE), POWER8)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||
else
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
|
||||
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
|
|||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER8)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define BUFFER_SIZE ( 64 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
|
|
|
@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
LDGEMM_L4_BEGIN:
|
||||
|
||||
mr CO, C
|
||||
li T1, 128
|
||||
li T2, 256
|
||||
mr AO, A
|
||||
slwi T1, LDC , 2
|
||||
add C, C, T1
|
||||
|
||||
mr CO, C
|
||||
slwi T3, LDC , 2
|
||||
add C, C, T3
|
||||
|
||||
dcbt A, T1
|
||||
dcbt A, T2
|
||||
|
||||
srawi. I, M, 4
|
||||
ble LDGEMM_L4x16_END
|
||||
|
||||
.align 4
|
||||
LDGEMM_L4x16_BEGIN_FIRST:
|
||||
|
||||
li L, -128
|
||||
|
||||
mr T1, CO
|
||||
add T2, T1, LDC
|
||||
add T3, T2, LDC
|
||||
add T4, T3, LDC
|
||||
|
||||
and T1, T1, L
|
||||
and T2, T2, L
|
||||
and T3, T3, L
|
||||
and T4, T4, L
|
||||
|
||||
dcbt T1, r0
|
||||
dcbt T2, r0
|
||||
dcbt T3, r0
|
||||
dcbt T4, r0
|
||||
|
||||
mr BO, B
|
||||
srawi. L, K, 2
|
||||
|
||||
addi T1, T1, 128
|
||||
addi T2, T2, 128
|
||||
addi T3, T3, 128
|
||||
addi T4, T4, 128
|
||||
|
||||
dcbt T1, r0
|
||||
dcbt T2, r0
|
||||
dcbt T3, r0
|
||||
dcbt T4, r0
|
||||
|
||||
ble LDGEMM_L4x16_SUB0_FIRST
|
||||
cmpwi cr0, L, 1
|
||||
ble LDGEMM_L4x16_SUB4_FIRST
|
||||
|
||||
.align 4
|
||||
LDGEMM_L4x16_LOOP_START_FIRST:
|
||||
|
||||
li T2, 512
|
||||
li o40, 40
|
||||
li o56, 56
|
||||
|
||||
dcbt AO, PRE
|
||||
dcbt BO, T2
|
||||
LOAD4x16_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_I1
|
||||
dcbt AO, PRE
|
||||
addic. L, L, -2
|
||||
KERNEL4x16_L2
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_L1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, T2
|
||||
KERNEL4x16_L2
|
||||
|
||||
ble LDGEMM_L4x16_LOOP_END_FIRST
|
||||
mtctr L
|
||||
|
||||
.align 4
|
||||
|
||||
LDGEMM_L4x16_LOOP_FIRST:
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_L1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_L2
|
||||
|
||||
dcbt AO, PRE
|
||||
KERNEL4x16_L1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, T2
|
||||
KERNEL4x16_L2
|
||||
|
||||
bdnz LDGEMM_L4x16_LOOP_FIRST
|
||||
|
||||
.align 4
|
||||
|
||||
LDGEMM_L4x16_LOOP_END_FIRST:
|
||||
|
||||
KERNEL4x16_L1
|
||||
KERNEL4x16_L2
|
||||
|
||||
KERNEL4x16_1
|
||||
KERNEL4x16_E2
|
||||
|
||||
b LDGEMM_L4x16_SUB1_FIRST
|
||||
|
||||
LDGEMM_L4x16_SUB4_FIRST:
|
||||
|
||||
KERNEL4x16_SUBI1
|
||||
KERNEL4x16_SUB1
|
||||
KERNEL4x16_SUB1
|
||||
KERNEL4x16_SUB1
|
||||
|
||||
b LDGEMM_L4x16_SUB1_FIRST
|
||||
|
||||
LDGEMM_L4x16_SUB0_FIRST:
|
||||
|
||||
andi. L, K, 3
|
||||
|
||||
KERNEL4x16_SUBI1
|
||||
|
||||
addic. L, L, -1
|
||||
ble LDGEMM_L4x16_SAVE_FIRST
|
||||
b LDGEMM_L4x16_SUB2_FIRST
|
||||
|
||||
LDGEMM_L4x16_SUB1_FIRST:
|
||||
|
||||
andi. L, K, 3
|
||||
ble LDGEMM_L4x16_SAVE_FIRST
|
||||
|
||||
LDGEMM_L4x16_SUB2_FIRST:
|
||||
|
||||
KERNEL4x16_SUB1
|
||||
|
||||
addic. L, L, -1
|
||||
bgt LDGEMM_L4x16_SUB2_FIRST
|
||||
|
||||
.align 4
|
||||
LDGEMM_L4x16_SAVE_FIRST:
|
||||
|
||||
SAVE4x16
|
||||
|
||||
addic. I, I, -1
|
||||
ble LDGEMM_L4x16_END
|
||||
|
||||
LDGEMM_L4x16_END_FIRST:
|
||||
|
||||
.align 4
|
||||
LDGEMM_L4x16_BEGIN:
|
||||
|
||||
|
@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
|
|||
dcbt T3, r0
|
||||
dcbt T4, r0
|
||||
|
||||
ble LDGEMM_L4x16_SUB0
|
||||
ble- LDGEMM_L4x16_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
ble LDGEMM_L4x16_SUB4
|
||||
ble- LDGEMM_L4x16_SUB4
|
||||
|
||||
.align 4
|
||||
LDGEMM_L4x16_LOOP_START:
|
||||
|
@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
|
|||
addic. L, L, -2
|
||||
KERNEL4x16_L2
|
||||
|
||||
ble LDGEMM_L4x16_LOOP_END
|
||||
ble- LDGEMM_L4x16_LOOP_END
|
||||
mtctr L
|
||||
|
||||
.align 4
|
||||
|
||||
|
@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
|
|||
dcbt AO, PRE
|
||||
KERNEL4x16_L1
|
||||
dcbt AO, PRE
|
||||
addic. L, L, -1
|
||||
// addic. L, L, -1
|
||||
KERNEL4x16_L2
|
||||
|
||||
bgt LDGEMM_L4x16_LOOP
|
||||
bdnz+ LDGEMM_L4x16_LOOP
|
||||
|
||||
.align 4
|
||||
|
||||
|
@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
|
|||
SAVE4x16
|
||||
|
||||
addic. I, I, -1
|
||||
bgt LDGEMM_L4x16_BEGIN
|
||||
bgt+ LDGEMM_L4x16_BEGIN
|
||||
|
||||
LDGEMM_L4x16_END:
|
||||
|
||||
|
|
|
@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE4x16
|
||||
|
||||
mr T1, CO
|
||||
add T2, T1, LDC
|
||||
add T3, T2, LDC
|
||||
add T4, T3, LDC
|
||||
add T2, CO, LDC
|
||||
|
||||
lxvd2x vs0, 0, CO
|
||||
lxvd2x vs1, o16, CO
|
||||
|
@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvd2x vs3, o48, CO
|
||||
lxvd2x vs4, o64, CO
|
||||
lxvd2x vs5, o80, CO
|
||||
add T3, T2, LDC
|
||||
lxvd2x vs6, o96, CO
|
||||
lxvd2x vs7, o112, CO
|
||||
|
||||
|
@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvd2x vs11, o48, T2
|
||||
lxvd2x vs12, o64, T2
|
||||
lxvd2x vs13, o80, T2
|
||||
add T4, T3, LDC
|
||||
lxvd2x vs14, o96, T2
|
||||
lxvd2x vs15, o112, T2
|
||||
|
||||
|
@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
lxvd2x vs31, o112, T3
|
||||
|
||||
xvmaddadp vs0, vs32, alpha_r
|
||||
xvmaddadp vs1, vs33, alpha_r
|
||||
xvmaddadp vs2, vs34, alpha_r
|
||||
xvmaddadp vs3, vs35, alpha_r
|
||||
xvmaddadp vs4, vs36, alpha_r
|
||||
xvmaddadp vs5, vs37, alpha_r
|
||||
xvmaddadp vs6, vs38, alpha_r
|
||||
xvmaddadp vs7, vs39, alpha_r
|
||||
|
||||
lxvd2x vs32, 0, T4
|
||||
xvmaddadp vs1, vs33, alpha_r
|
||||
lxvd2x vs33, o16, T4
|
||||
xvmaddadp vs2, vs34, alpha_r
|
||||
lxvd2x vs34, o32, T4
|
||||
xvmaddadp vs3, vs35, alpha_r
|
||||
lxvd2x vs35, o48, T4
|
||||
xvmaddadp vs4, vs36, alpha_r
|
||||
lxvd2x vs36, o64, T4
|
||||
xvmaddadp vs5, vs37, alpha_r
|
||||
lxvd2x vs37, o80, T4
|
||||
xvmaddadp vs6, vs38, alpha_r
|
||||
lxvd2x vs38, o96, T4
|
||||
xvmaddadp vs7, vs39, alpha_r
|
||||
lxvd2x vs39, o112, T4
|
||||
|
||||
xvmaddadp vs8, vs40, alpha_r
|
||||
|
@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvmaddadp vs10, vs42, alpha_r
|
||||
xvmaddadp vs11, vs43, alpha_r
|
||||
|
||||
stxvd2x vs0, 0, T1
|
||||
stxvd2x vs1, o16, T1
|
||||
stxvd2x vs2, o32, T1
|
||||
stxvd2x vs3, o48, T1
|
||||
|
||||
xvmaddadp vs12, vs44, alpha_r
|
||||
xvmaddadp vs13, vs45, alpha_r
|
||||
xvmaddadp vs14, vs46, alpha_r
|
||||
xvmaddadp vs15, vs47, alpha_r
|
||||
|
||||
stxvd2x vs4, o64, T1
|
||||
stxvd2x vs5, o80, T1
|
||||
stxvd2x vs6, o96, T1
|
||||
stxvd2x vs7, o112, T1
|
||||
|
||||
xvmaddadp vs24, vs48, alpha_r
|
||||
xvmaddadp vs25, vs49, alpha_r
|
||||
xvmaddadp vs26, vs50, alpha_r
|
||||
xvmaddadp vs27, vs51, alpha_r
|
||||
|
||||
stxvd2x vs8, o0, T2
|
||||
stxvd2x vs9, o16, T2
|
||||
stxvd2x vs10, o32, T2
|
||||
stxvd2x vs11, o48, T2
|
||||
|
||||
xvmaddadp vs28, vs52, alpha_r
|
||||
xvmaddadp vs29, vs53, alpha_r
|
||||
xvmaddadp vs30, vs54, alpha_r
|
||||
xvmaddadp vs31, vs55, alpha_r
|
||||
|
||||
stxvd2x vs12, o64, T2
|
||||
stxvd2x vs13, o80, T2
|
||||
stxvd2x vs14, o96, T2
|
||||
stxvd2x vs15, o112, T2
|
||||
stxvd2x vs0, 0, CO
|
||||
stxvd2x vs1, o16, CO
|
||||
stxvd2x vs2, o32, CO
|
||||
stxvd2x vs3, o48, CO
|
||||
|
||||
stxvd2x vs4, o64, CO
|
||||
stxvd2x vs5, o80, CO
|
||||
stxvd2x vs6, o96, CO
|
||||
stxvd2x vs7, o112, CO
|
||||
|
||||
xvmaddadp vs32, vs56, alpha_r
|
||||
xvmaddadp vs33, vs57, alpha_r
|
||||
xvmaddadp vs34, vs58, alpha_r
|
||||
xvmaddadp vs35, vs59, alpha_r
|
||||
|
||||
stxvd2x vs24, 0, T3
|
||||
stxvd2x vs25, o16, T3
|
||||
stxvd2x vs26, o32, T3
|
||||
stxvd2x vs27, o48, T3
|
||||
|
||||
xvmaddadp vs36, vs60, alpha_r
|
||||
xvmaddadp vs37, vs61, alpha_r
|
||||
xvmaddadp vs38, vs62, alpha_r
|
||||
xvmaddadp vs39, vs63, alpha_r
|
||||
|
||||
addi CO, CO, 128
|
||||
|
||||
stxvd2x vs8, o0, T2
|
||||
stxvd2x vs9, o16, T2
|
||||
stxvd2x vs10, o32, T2
|
||||
stxvd2x vs11, o48, T2
|
||||
|
||||
stxvd2x vs12, o64, T2
|
||||
stxvd2x vs13, o80, T2
|
||||
stxvd2x vs14, o96, T2
|
||||
stxvd2x vs15, o112, T2
|
||||
|
||||
stxvd2x vs24, 0, T3
|
||||
stxvd2x vs25, o16, T3
|
||||
stxvd2x vs28, o64, T3
|
||||
stxvd2x vs29, o80, T3
|
||||
|
||||
stxvd2x vs26, o32, T3
|
||||
stxvd2x vs27, o48, T3
|
||||
stxvd2x vs30, o96, T3
|
||||
stxvd2x vs31, o112, T3
|
||||
|
||||
|
@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvd2x vs34, o32, T4
|
||||
stxvd2x vs35, o48, T4
|
||||
|
||||
addi CO, CO, 128
|
||||
|
||||
stxvd2x vs36, o64, T4
|
||||
stxvd2x vs37, o80, T4
|
||||
stxvd2x vs38, o96, T4
|
||||
|
|
6
param.h
6
param.h
|
@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define DNUMOPT 8
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 4096
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ZGEMM_DEFAULT_P 320
|
||||
|
||||
#define SGEMM_DEFAULT_Q 640
|
||||
#define DGEMM_DEFAULT_Q 640
|
||||
#define DGEMM_DEFAULT_Q 720
|
||||
#define CGEMM_DEFAULT_Q 640
|
||||
#define ZGEMM_DEFAULT_Q 640
|
||||
|
||||
|
|
Loading…
Reference in New Issue