updated optimized CGEMM and CTRMM kernels for POWER8

Werner Saar 2016-04-04 09:12:08 +02:00
parent e1cdd15b30
commit c5b1fbcb2e
7 changed files with 7397 additions and 909 deletions


@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha_si vs31
-#define NOTUSED r14
+#define BBUFFER r14
 #define L r15
 #define o12 r16
 #define o4 r17
 #define T2 r19
-#define KK r20
+#define BBO r20
 #define o8 r21
 #define I r22
 #define J r23
@@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	li	o32, 32
 	li	o48, 48
+	li	T1, 256
+	slwi	T1, T1, 9		// 131072
+	sub	BBUFFER, A, T1		// temp buffer for B unrolled
+
 #ifdef __64BIT__
 	addi	T1, SP, 296
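The three added instructions place a 128 KiB scratch buffer directly below the packed A panel (256 << 9 = 131072 bytes). A minimal C sketch of the same address arithmetic; the function name is mine, and it assumes, as the kernel does, that the region below A is safe to reuse:

```c
#include <stdint.h>

/* Mirror of: li T1,256 ; slwi T1,T1,9 ; sub BBUFFER,A,T1
 * Assumes at least 128 KiB of usable memory directly below
 * the packed A panel (an assumption the kernel relies on). */
static inline uintptr_t bbuffer_addr(uintptr_t a)
{
    uintptr_t t1 = (uintptr_t)256 << 9;  /* 256 * 512 = 131072 bytes */
    return a - t1;                       /* BBUFFER sits below A     */
}
```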


@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 CGEMM_L4_BEGIN:
+	mr	BO, B
+	mr	BBO, BBUFFER
+	slwi	T1, K, 3
+
+CGEMM_L4_COPYB:
+	dcbtst	BBO, PRE
+	lxvw4x	vs3, o0, BO
+	lxvw4x	vs11, o16, BO
+	xxspltw	vs4, vs3, 0
+	xxspltw	vs5, vs3, 1
+	xxspltw	vs6, vs3, 2
+	xxspltw	vs7, vs3, 3
+	xxspltw	vs12, vs11, 0
+	xxspltw	vs13, vs11, 1
+	xxspltw	vs14, vs11, 2
+	xxspltw	vs15, vs11, 3
+	stxvw4x	vs4, o0, BBO
+	stxvw4x	vs5, o16, BBO
+	stxvw4x	vs6, o32, BBO
+	stxvw4x	vs7, o48, BBO
+	addi	BO, BO, 32
+	addi	BBO, BBO, 64
+	stxvw4x	vs12, o0, BBO
+	stxvw4x	vs13, o16, BBO
+	stxvw4x	vs14, o32, BBO
+	stxvw4x	vs15, o48, BBO
+	addic.	T1, T1, -8
+	addi	BBO, BBO, 64
+	bge	CGEMM_L4_COPYB
+
 	mr	CO, C
 	mr	AO, A
 	slwi	T1, LDC, 2
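The new COPYB loop pre-broadcasts B into BBUFFER: each lxvw4x pulls in four packed floats, xxspltw replicates each float across a full vector, and stxvw4x writes the broadcast vectors out, so the inner GEMM kernel can use plain vector loads instead of per-iteration splats. A scalar C model of the same transformation; the function and parameter names are mine, not from the commit:

```c
#include <stddef.h>

/* Scalar model of CGEMM_L4_COPYB: for every float in the packed
 * B panel (8 floats per k iteration when N = 4), store a 4-wide
 * broadcast copy.  The inner loop corresponds to one
 * xxspltw + stxvw4x pair in the assembly. */
static void copy_b_broadcast(const float *bo, float *bbo, size_t k)
{
    for (size_t i = 0; i < 8 * k; i++)          /* 8 floats of B per k */
        for (size_t lane = 0; lane < 4; lane++) /* splat across lanes  */
            bbo[4 * i + lane] = bo[i];
}
```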
@@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
 CGEMM_L4x8_BEGIN:
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L4x8_SUB0
 	cmpwi	cr0, L, 1
@@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
 	dcbt	AO, PRE
 	dcbt	BO, PRE
 	LOAD4x8_1
+	dcbt	BO, PRE
 	KERNEL4x8_I1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
+	dcbt	BO, PRE
 	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
+	KERNEL4x8_1
+	dcbt	AO, PRE
 	dcbt	BO, PRE
+	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	BO, PRE
+	dcbt	AO, PRE
+	KERNEL4x8_2
+	dcbt	BO, PRE
+	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
@@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
 CGEMM_L4x8_LOOP:
+	dcbt	BO, PRE
 	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
+	dcbt	BO, PRE
 	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
+	KERNEL4x8_1
+	dcbt	AO, PRE
 	dcbt	BO, PRE
+	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	BO, PRE
+	dcbt	AO, PRE
+	KERNEL4x8_2
+	dcbt	BO, PRE
+	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
@@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
 CGEMM_L4x8_LOOP_END:
+	dcbt	BO, PRE
 	KERNEL4x8_1
+	dcbt	BO, PRE
 	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
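The reworked schedule issues a dcbt for the B stream ahead of nearly every kernel macro, instead of prefetching A alone; with B now read from the wide BBUFFER copy, that stream moves four times faster. A rough C analogue using GCC's __builtin_prefetch; the loop body, PRE distance, and stride values are my illustration, not numbers from the commit:

```c
#include <stddef.h>

/* Rough analogue of the CGEMM_L4x8 loop: touch the A and B streams
 * PRE elements ahead of the current position before each unrolled
 * block of FMAs.  PRE, STRIDE_A and STRIDE_B are assumed values. */
enum { PRE = 384, STRIDE_A = 64, STRIDE_B = 64 };   /* in floats */

static void kernel_loop(const float *ao, const float *bo, size_t l)
{
    while (l--) {
        __builtin_prefetch(bo + PRE, 0, 1); /* dcbt BO, PRE (read hint) */
        __builtin_prefetch(ao + PRE, 0, 1); /* dcbt AO, PRE (read hint) */
        /* ... KERNEL4x8_1 / KERNEL4x8_2 multiply-accumulate work ... */
        ao += STRIDE_A;
        bo += STRIDE_B;
    }
}
```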
@@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
 	andi.	T1, M, 4
 	ble	CGEMM_L4x4_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L4x4_SUB0
 	cmpwi	cr0, L, 1
@@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
 	andi.	T1, M, 2
 	ble	CGEMM_L4x2_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L4x2_SUB0
 	cmpwi	cr0, L, 1
@@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
 	andi.	T1, M, 1
 	ble	CGEMM_L4x1_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L4x1_SUB0
 	cmpwi	cr0, L, 1
@@ -482,6 +531,39 @@ L999_H1:
 CGEMM_L2_BEGIN:
+	mr	BO, B
+	mr	BBO, BBUFFER
+	slwi	T1, K, 2
+
+CGEMM_L2_COPYB:
+	dcbtst	BBO, PRE
+	lxvw4x	vs3, o0, BO
+	lxvw4x	vs11, o16, BO
+	xxspltw	vs4, vs3, 0
+	xxspltw	vs5, vs3, 1
+	xxspltw	vs6, vs3, 2
+	xxspltw	vs7, vs3, 3
+	xxspltw	vs12, vs11, 0
+	xxspltw	vs13, vs11, 1
+	xxspltw	vs14, vs11, 2
+	xxspltw	vs15, vs11, 3
+	stxvw4x	vs4, o0, BBO
+	stxvw4x	vs5, o16, BBO
+	stxvw4x	vs6, o32, BBO
+	stxvw4x	vs7, o48, BBO
+	addi	BO, BO, 32
+	addi	BBO, BBO, 64
+	stxvw4x	vs12, o0, BBO
+	stxvw4x	vs13, o16, BBO
+	stxvw4x	vs14, o32, BBO
+	stxvw4x	vs15, o48, BBO
+	addic.	T1, T1, -8
+	addi	BBO, BBO, 64
+	bge	CGEMM_L2_COPYB
+
 	andi.	T1, N, 2
 	ble	CGEMM_L2_END
 	mr	CO, C
@@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
 CGEMM_L2x8_BEGIN:
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L2x8_SUB0
 	cmpwi	cr0, L, 1
@@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
 	andi.	T1, M, 4
 	ble	CGEMM_L2x4_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L2x4_SUB0
 	cmpwi	cr0, L, 1
@@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
 	andi.	T1, M, 2
 	ble	CGEMM_L2x2_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L2x2_SUB0
 	cmpwi	cr0, L, 1
@@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
 	andi.	T1, M, 1
 	ble	CGEMM_L2x1_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L2x1_SUB0
 	cmpwi	cr0, L, 1
@@ -919,6 +1001,39 @@ L999_H2:
 CGEMM_L1_BEGIN:
+	mr	BO, B
+	mr	BBO, BBUFFER
+	slwi	T1, K, 1
+
+CGEMM_L1_COPYB:
+	dcbtst	BBO, PRE
+	lxvw4x	vs3, o0, BO
+	lxvw4x	vs11, o16, BO
+	xxspltw	vs4, vs3, 0
+	xxspltw	vs5, vs3, 1
+	xxspltw	vs6, vs3, 2
+	xxspltw	vs7, vs3, 3
+	xxspltw	vs12, vs11, 0
+	xxspltw	vs13, vs11, 1
+	xxspltw	vs14, vs11, 2
+	xxspltw	vs15, vs11, 3
+	stxvw4x	vs4, o0, BBO
+	stxvw4x	vs5, o16, BBO
+	stxvw4x	vs6, o32, BBO
+	stxvw4x	vs7, o48, BBO
+	addi	BO, BO, 32
+	addi	BBO, BBO, 64
+	stxvw4x	vs12, o0, BBO
+	stxvw4x	vs13, o16, BBO
+	stxvw4x	vs14, o32, BBO
+	stxvw4x	vs15, o48, BBO
+	addic.	T1, T1, -8
+	addi	BBO, BBO, 64
+	bge	CGEMM_L1_COPYB
+
 	andi.	T1, N, 1
 	ble	CGEMM_L1_END
 	mr	CO, C
@@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
 CGEMM_L1x8_BEGIN:
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L1x8_SUB0
 	cmpwi	cr0, L, 1
@@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
 	andi.	T1, M, 4
 	ble	CGEMM_L1x4_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L1x4_SUB0
 	cmpwi	cr0, L, 1
@@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
 	andi.	T1, M, 2
 	ble	CGEMM_L1x2_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L1x2_SUB0
 	cmpwi	cr0, L, 1
@@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
 	andi.	T1, M, 1
 	ble	CGEMM_L1x1_END
-	mr	BO, B
+	mr	BO, BBUFFER
 	srawi.	L, K, 3
 	ble	CGEMM_L1x1_SUB0
 	cmpwi	cr0, L, 1

File diff suppressed because it is too large.


@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -275,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
-#include "cgemm_macros_8x4_power8.S"
+#include "ctrmm_macros_8x4_power8.S"
 	cmpwi	cr0, M, 0
 	ble	L999_H1


@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -83,15 +83,22 @@ CTRMM_L4x8_BEGIN:
 CTRMM_L4x8_LOOP_START:
+	dcbt	AO, PRE
+	dcbt	BO, PRE
 	LOAD4x8_1
 	KERNEL4x8_I1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
+	dcbt	BO, PRE
 	KERNEL4x8_2
 	addic.	L, L, -2
@@ -102,13 +109,18 @@ CTRMM_L4x8_LOOP_START:
 CTRMM_L4x8_LOOP:
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
+	dcbt	BO, PRE
 	KERNEL4x8_2
 	addic.	L, L, -1
@@ -117,8 +129,10 @@ CTRMM_L4x8_LOOP:
 CTRMM_L4x8_LOOP_END:
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1
+	dcbt	AO, PRE
 	KERNEL4x8_2
 	KERNEL4x8_1

File diff suppressed because it is too large.


@@ -1979,7 +1979,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 480
+#define CGEMM_DEFAULT_P 720
 #define ZGEMM_DEFAULT_P 240
 #define SGEMM_DEFAULT_Q 720
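Raising CGEMM_DEFAULT_P from 480 to 720 enlarges the M-dimension block of the new kernel. A quick working-set check (my arithmetic; it assumes CGEMM_DEFAULT_Q is also 720, which this excerpt does not show):

```c
#include <stdio.h>

int main(void)
{
    /* P = 720 is the new value above; Q = 720 is an assumption. */
    long p = 720, q = 720;
    long elem = (long)(2 * sizeof(float));  /* single-precision complex */
    printf("A-panel: %ld bytes\n", p * q * elem);  /* 4147200 ~ 3.96 MiB */
    return 0;
}
```

If those assumptions hold, a packed panel of A stays within POWER8's roughly 8 MB of L3 cache per core.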