optimized dgemm for POWER8

This commit is contained in:
Werner Saar
2016-04-27 14:01:08 +02:00
parent 6abec09eb4
commit 0d0c6f7d7d
9 changed files with 3998 additions and 494 deletions

View File

@@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = dgemm_tcopy_16_power8.S
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c

View File

@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define o0 0
#define T4 r12
#define T3 r11
#define o8 r15
#define o24 r16
#define ALPHA r17
@@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi ALPHA, SP, 224
#endif
li PRE, 256
li PRE, 384
li o8 , 8
li o16, 16
li o24, 24

File diff suppressed because it is too large. Load Diff

View File

@@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mr T1, CO
addi T2, T1, 64
add T3, T1, LDC
addi T4, T3, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
#ifndef TRMMKERNEL
@@ -453,45 +465,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs40, alpha_r
xvmaddadp vs9, vs41, alpha_r
xvmaddadp vs10, vs42, alpha_r
@@ -501,6 +474,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
#else
xvmuldp vs0, vs32, alpha_r
xvmuldp vs1, vs33, alpha_r
xvmuldp vs2, vs34, alpha_r
xvmuldp vs3, vs35, alpha_r
xvmuldp vs4, vs36, alpha_r
xvmuldp vs5, vs37, alpha_r
xvmuldp vs6, vs38, alpha_r
xvmuldp vs7, vs39, alpha_r
xvmuldp vs8, vs40, alpha_r
xvmuldp vs9, vs41, alpha_r
xvmuldp vs10, vs42, alpha_r
@@ -511,20 +492,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs15, vs47, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
add T1, T1, LDC
add T2, T2, LDC
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
slwi T4, LDC, 1
add T1, T1, T4
add T3, T3, T4
addi T2, T1, 64
addi T4, T3, 64
#ifndef TRMMKERNEL
lxvd2x vs0, 0, T1
@@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs5, o16, T2
lxvd2x vs6, o32, T2
lxvd2x vs7, o48, T2
lxvd2x vs8, 0, T3
lxvd2x vs9, o16, T3
lxvd2x vs10, o32, T3
lxvd2x vs11, o48, T3
lxvd2x vs12, 0, T4
lxvd2x vs13, o16, T4
lxvd2x vs14, o32, T4
lxvd2x vs15, o48, T4
#endif
#ifndef TRMMKERNEL
@@ -547,45 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs5, vs53, alpha_r
xvmaddadp vs6, vs54, alpha_r
xvmaddadp vs7, vs55, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
#endif
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs8, 0, T1
lxvd2x vs9, o16, T1
lxvd2x vs10, o32, T1
lxvd2x vs11, o48, T1
lxvd2x vs12, 0, T2
lxvd2x vs13, o16, T2
lxvd2x vs14, o32, T2
lxvd2x vs15, o48, T2
#endif
#ifndef TRMMKERNEL
xvmaddadp vs8, vs56, alpha_r
xvmaddadp vs9, vs57, alpha_r
xvmaddadp vs10, vs58, alpha_r
@@ -595,6 +558,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs14, vs62, alpha_r
xvmaddadp vs15, vs63, alpha_r
#else
xvmuldp vs0, vs48, alpha_r
xvmuldp vs1, vs49, alpha_r
xvmuldp vs2, vs50, alpha_r
xvmuldp vs3, vs51, alpha_r
xvmuldp vs4, vs52, alpha_r
xvmuldp vs5, vs53, alpha_r
xvmuldp vs6, vs54, alpha_r
xvmuldp vs7, vs55, alpha_r
xvmuldp vs8, vs56, alpha_r
xvmuldp vs9, vs57, alpha_r
xvmuldp vs10, vs58, alpha_r
@@ -605,17 +576,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmuldp vs15, vs63, alpha_r
#endif
stxvd2x vs8, 0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
dcbt T1, PRE
stxvd2x vs4, 0, T2
stxvd2x vs5, o16, T2
stxvd2x vs6, o32, T2
stxvd2x vs7, o48, T2
stxvd2x vs12, 0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
stxvd2x vs8, 0, T3
stxvd2x vs9, o16, T3
stxvd2x vs10, o32, T3
stxvd2x vs11, o48, T3
stxvd2x vs12, 0, T4
stxvd2x vs13, o16, T4
stxvd2x vs14, o32, T4
stxvd2x vs15, o48, T4
addi CO, CO, 128

View File

@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B
add B1, B1, B
li PREA, 768
li PREA, 256
addi PREB, M16, 128
li o8, 8

View File

@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
DCOPYT_L4x16_LOOP:
/*
addi T1, PREB, 128
addi T2, PREB, 256
*/
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
/*
dcbtst BO, M16
dcbtst BO, PREB
dcbtst BO, T1
dcbtst BO, T2
*/
COPY_4x16
add BO, BO, M16

View File

@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
#include "dtrmm_macros_16x4_power8.S"
#ifndef NEEDPARAM

File diff suppressed because it is too large. Load Diff