optimized dgemm for POWER8

commit 0d0c6f7d7d
parent 6abec09eb4
Author: Werner Saar
Date:   2016-04-27 14:01:08 +02:00

9 changed files with 3998 additions and 494 deletions


@@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
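Note on this Makefile hunk: the hand-written gemm_ncopy_4.S/gemm_tcopy_4.S outer-copy kernels are swapped for the generic C implementations. These copy routines pack 4-wide panels of the input matrix into a contiguous buffer so the assembly kernel can stream its operands with unit stride. A minimal sketch of the idea (illustrative only, not the actual ../generic/gemm_ncopy_4.c source; remainder handling omitted):

/* Pack panels of 4 columns of a column-major matrix into a contiguous
 * buffer, interleaving the 4 columns row by row. */
void gemm_ncopy_4_sketch(long m, long n, const double *a, long lda, double *b)
{
    for (long j = 0; j + 4 <= n; j += 4) {   /* one panel = 4 columns */
        const double *a0 = a + (j + 0) * lda;
        const double *a1 = a + (j + 1) * lda;
        const double *a2 = a + (j + 2) * lda;
        const double *a3 = a + (j + 3) * lda;
        for (long i = 0; i < m; i++) {       /* same row of each column, back to back */
            *b++ = a0[i]; *b++ = a1[i]; *b++ = a2[i]; *b++ = a3[i];
        }
    }
    /* columns left over when n % 4 != 0 are handled by narrower loops */
}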


@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0
#define T4 r12
#define T3 r11
#define o8 r15
#define o24 r16
#define ALPHA r17
@@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi ALPHA, SP, 224
#endif
li PRE, 256
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24
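PRE is the byte offset used with the dcbt data-cache-touch hints in the store path, so raising it from 256 to 384 moves the prefetch from two to three 128-byte POWER8 cache lines ahead of the pointer being streamed. A hedged illustration of the pattern (GCC inline asm; the wrapper name is made up):

/* Illustrative only: how a PRE-byte prefetch distance pairs with dcbt.
 * dcbt 0, rb touches the cache line containing the address in rb. */
#define PRE 384
static inline void prefetch_ahead(const double *p)
{
    __asm__ volatile ("dcbt 0, %0" : : "r" ((const char *)p + PRE));
}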

File diff suppressed because it is too large


@@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mr T1, CO
addi T2, T1, 64
add T3, T1, LDC
addi T4, T3, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
#ifndef TRMMKERNEL
@@ -453,45 +465,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
@@ -501,6 +474,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
@@ -511,20 +492,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
add T1, T1, LDC
add T2, T2, LDC
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
slwi T4, LDC, 1
add T1, T1, T4
add T3, T3, T4
addi T2, T1, 64
addi T4, T3, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
#ifndef TRMMKERNEL
@@ -547,45 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
@@ -595,6 +558,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
@@ -605,17 +576,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs15, vs63, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
addi CO, CO, 128
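The restructuring in these hunks is the core of the store-path change. The old sequence walked the columns of the 16x4 tile one at a time (load C, scale, store, bump by LDC); the new one sets up T1..T4 to cover two columns at once (each column of sixteen doubles split at a 64-byte offset), hoists all the C loads ahead of the alpha multiplies, and groups the stores afterwards, then advances by 2*LDC for the next column pair. A C model of what the epilogue computes per tile (illustrative; acc stands in for the vs32..vs63 accumulators, trmm for a TRMMKERNEL build):

/* GEMM path:  C = alpha*A*B + C  (xvmaddadp)
 * TRMM path:  C = alpha*A*B      (xvmuldp)   */
static void save_16x4_sketch(double *c, long ldc, double alpha,
                             const double acc[4][16], int trmm)
{
    for (int j = 0; j < 4; j++) {            /* four columns of the tile */
        double *cj = c + j * ldc;
        for (int i = 0; i < 16; i++)         /* sixteen rows */
            cj[i] = trmm ? alpha * acc[j][i]
                         : cj[i] + alpha * acc[j][i];
    }
}

Separating the loads from the dependent multiplies and stores gives the load/store unit independent work to overlap, instead of alternating load/scale/store triples on the same addresses.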


@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B
add B1, B1, B
li PREA, 768
li PREA, 256
addi PREB, M16, 128
li o8, 8


@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
DCOPYT_L4x16_LOOP:
/*
addi T1, PREB, 128
addi T2, PREB, 256
*/
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
/*
dcbtst BO, M16
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
*/
COPY_4x16
add BO, BO, M16
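Note the prefetch tuning in this copy loop: the dcbt read hints on the four source rows stay active, while the dcbtst write hints on the packed destination BO are commented out, so the sequential store stream is left to the hardware prefetcher. For reference, the two instructions differ only in declared intent (illustrative wrappers, not kernel code):

/* dcbt hints that a cache line will be read, dcbtst that it will be
 * written; both are non-faulting touch hints. */
static inline void touch_read (const void *p) { __asm__ volatile ("dcbt 0, %0"   : : "r" (p)); }
static inline void touch_write(      void *p) { __asm__ volatile ("dcbtst 0, %0" : : "r" (p)); }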


@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
#include "dtrmm_macros_16x4_power8.S"
#ifndef NEEDPARAM

File diff suppressed because it is too large

param.h

@@ -410,7 +410,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(STEAMROLLER) || defined(EXCAVATOR)
#ifdef STEAMROLLER
#define SNUMOPT 8
#define DNUMOPT 4
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define XGEMM_DEFAULT_UNROLL_M 1
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 4
#define ZGEMM3M_DEFAULT_UNROLL_M 4
#define GEMV_UNROLL 8
#endif
#if defined(ARCH_X86_64)
#define SGEMM_DEFAULT_P 768
#define DGEMM_DEFAULT_P 576
#define ZGEMM_DEFAULT_P 288
#define CGEMM_DEFAULT_P 576
#else
#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 480
#define ZGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#endif
#define QGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56
#if defined(ARCH_X86_64)
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 160
#define ZGEMM_DEFAULT_Q 160
#define CGEMM_DEFAULT_Q 160
#else
#define SGEMM_DEFAULT_Q 224
#define DGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#endif
#define QGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_P 448
#define ZGEMM3M_DEFAULT_P 224
#define XGEMM3M_DEFAULT_P 112
#define CGEMM3M_DEFAULT_Q 224
#define ZGEMM3M_DEFAULT_Q 224
#define XGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_R 12288
#define ZGEMM3M_DEFAULT_R 12288
#define XGEMM3M_DEFAULT_R 12288
#define SGEMM_DEFAULT_R 12288
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R 12288
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r
#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE
#define GEMM_THREAD gemm_thread_mn
#endif
#ifdef EXCAVATOR
#define SNUMOPT 8
#define DNUMOPT 4
@@ -1885,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 1280
#define DGEMM_DEFAULT_P 640
#define DGEMM_DEFAULT_P 768
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 768
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
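For scale: with the usual OpenBLAS convention that P blocks the M dimension and Q blocks the K dimension, the packed A panel holds P x Q doubles, so moving both defaults from 640 to 768 grows it from about 3.1 MiB to 4.5 MiB, which still fits comfortably in POWER8's 8 MiB of L3 per core. A quick back-of-the-envelope check (assumes that convention):

#include <stdio.h>

int main(void)
{
    long p = 768, q = 768;                               /* new DGEMM_DEFAULT_P/Q */
    printf("new A panel: %.2f MiB\n", p * q * 8 / 1048576.0);       /* 4.50 MiB */
    printf("old A panel: %.2f MiB\n", 640 * 640 * 8 / 1048576.0);   /* 3.13 MiB */
    return 0;
}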