Updated the optimized cgemm and ctrmm kernels for POWER8
This commit is contained in:
parent
e1cdd15b30
commit
c5b1fbcb2e
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define alpha_si vs31
|
||||
|
||||
|
||||
#define NOTUSED r14
|
||||
#define BBUFFER r14
|
||||
#define L r15
|
||||
#define o12 r16
|
||||
#define o4 r17
|
||||
#define T2 r19
|
||||
#define KK r20
|
||||
#define BBO r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
|
@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
li o32 , 32
|
||||
li o48 , 48
|
||||
|
||||
li T1, 256
|
||||
slwi T1, T1, 9 // 131072
|
||||
sub BBUFFER, A, T1 // temp buffer for B unrolled
|
||||
|
||||
|
||||
#ifdef __64BIT__
|
||||
addi T1 , SP, 296
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
CGEMM_L4_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 3
|
||||
|
||||
// CGEMM_L4_COPYB: expand the data at BO into the scratch area at BBO.
// Each pass loads 32 bytes (eight 32-bit words) from BO and stores 128 bytes
// at BBO: every source word is replicated into all four words of its own
// 16-byte vector. BO advances by 32 and BBO by 2 x 64 per pass.
// Loop control: T1 is decremented by 8 per pass (record form) and the loop
// repeats while the decremented value is >= 0.
// NOTE(review): T1 is set to K << 3 just before this label, so the body
// appears to run K + 1 times (one more pass after T1 reaches 0) - confirm
// the extra 32-byte read / 128-byte write is intended and stays in bounds.
// NOTE(review): o0, o16 and PRE are defined outside this view; o0/o16 are
// presumably byte offsets 0 and 16 - confirm.
CGEMM_L4_COPYB:
|
||||
dcbtst BBO, PRE // store-intent cache hint for the destination, PRE ahead of BBO
|
||||
|
||||
lxvw4x vs3, o0, BO // vs3 = source words 0..3
|
||||
lxvw4x vs11, o16, BO // vs11 = source words 4..7
|
||||
xxspltw vs4, vs3, 0 // vs4 = word 0 of vs3 in all four lanes
|
||||
xxspltw vs5, vs3, 1 // vs5 = word 1 of vs3 in all four lanes
|
||||
xxspltw vs6, vs3, 2 // vs6 = word 2 of vs3 in all four lanes
|
||||
xxspltw vs7, vs3, 3 // vs7 = word 3 of vs3 in all four lanes
|
||||
xxspltw vs12, vs11, 0 // vs12 = word 0 of vs11 in all four lanes
|
||||
xxspltw vs13, vs11, 1 // vs13 = word 1 of vs11 in all four lanes
|
||||
xxspltw vs14, vs11, 2 // vs14 = word 2 of vs11 in all four lanes
|
||||
xxspltw vs15, vs11, 3 // vs15 = word 3 of vs11 in all four lanes
|
||||
stxvw4x vs4, o0, BBO // first 64 output bytes: splats of source words 0..3
|
||||
stxvw4x vs5, o16, BBO
|
||||
||||
stxvw4x vs6, o32, BBO
|
||||
stxvw4x vs7, o48, BBO
|
||||
addi BO, BO, 32 // source consumed: 32 bytes
|
||||
addi BBO, BBO, 64 // move to the second 64-byte half of this pass's output
|
||||
stxvw4x vs12, o0, BBO // second 64 output bytes: splats of source words 4..7
|
||||
stxvw4x vs13, o16, BBO
|
||||
stxvw4x vs14, o32, BBO
|
||||
stxvw4x vs15, o48, BBO
|
||||
addic. T1, T1, -8 // T1 -= 8; record form sets CR0 from the new value
|
||||
addi BBO, BBO, 64 // non-record add: leaves CR0 intact for the branch below
|
||||
|
||||
bge CGEMM_L4_COPYB // repeat while the decremented T1 is >= 0
|
||||
|
||||
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
slwi T1, LDC , 2
|
||||
|
@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
|
|||
CGEMM_L4x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L4x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
|
|||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
LOAD4x8_1
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_I1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
|
@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
|
|||
|
||||
CGEMM_L4x8_LOOP:
|
||||
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
|
@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
|
|||
|
||||
CGEMM_L4x8_LOOP_END:
|
||||
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_1
|
||||
dcbt BO, PRE
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
|
@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
|
|||
|
||||
andi. T1, M, 4
|
||||
ble CGEMM_L4x4_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L4x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
|
|||
|
||||
andi. T1, M, 2
|
||||
ble CGEMM_L4x2_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L4x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
|
|||
|
||||
andi. T1, M, 1
|
||||
ble CGEMM_L4x1_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L4x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -482,6 +531,39 @@ L999_H1:
|
|||
|
||||
CGEMM_L2_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 2
|
||||
|
||||
// CGEMM_L2_COPYB: expand the data at BO into the scratch area at BBO.
// Each pass loads 32 bytes (eight 32-bit words) from BO and stores 128 bytes
// at BBO: every source word is replicated into all four words of its own
// 16-byte vector. BO advances by 32 and BBO by 2 x 64 per pass.
// Loop control: T1 is decremented by 8 per pass (record form) and the loop
// repeats while the decremented value is >= 0.
// NOTE(review): T1 is set to K << 2 just before this label, so the body
// appears to run floor(K / 2) + 1 times (it still runs once more after T1
// reaches 0) - confirm the amount copied is intended and stays in bounds.
// NOTE(review): o0, o16 and PRE are defined outside this view; o0/o16 are
// presumably byte offsets 0 and 16 - confirm.
CGEMM_L2_COPYB:
|
||||
dcbtst BBO, PRE // store-intent cache hint for the destination, PRE ahead of BBO
|
||||
|
||||
lxvw4x vs3, o0, BO // vs3 = source words 0..3
|
||||
lxvw4x vs11, o16, BO // vs11 = source words 4..7
|
||||
xxspltw vs4, vs3, 0 // vs4 = word 0 of vs3 in all four lanes
|
||||
xxspltw vs5, vs3, 1 // vs5 = word 1 of vs3 in all four lanes
|
||||
xxspltw vs6, vs3, 2 // vs6 = word 2 of vs3 in all four lanes
|
||||
xxspltw vs7, vs3, 3 // vs7 = word 3 of vs3 in all four lanes
|
||||
xxspltw vs12, vs11, 0 // vs12 = word 0 of vs11 in all four lanes
|
||||
xxspltw vs13, vs11, 1 // vs13 = word 1 of vs11 in all four lanes
|
||||
xxspltw vs14, vs11, 2 // vs14 = word 2 of vs11 in all four lanes
|
||||
xxspltw vs15, vs11, 3 // vs15 = word 3 of vs11 in all four lanes
|
||||
stxvw4x vs4, o0, BBO // first 64 output bytes: splats of source words 0..3
|
||||
stxvw4x vs5, o16, BBO
|
||||
||||
stxvw4x vs6, o32, BBO
|
||||
stxvw4x vs7, o48, BBO
|
||||
addi BO, BO, 32 // source consumed: 32 bytes
|
||||
addi BBO, BBO, 64 // move to the second 64-byte half of this pass's output
|
||||
stxvw4x vs12, o0, BBO // second 64 output bytes: splats of source words 4..7
|
||||
stxvw4x vs13, o16, BBO
|
||||
stxvw4x vs14, o32, BBO
|
||||
stxvw4x vs15, o48, BBO
|
||||
addic. T1, T1, -8 // T1 -= 8; record form sets CR0 from the new value
|
||||
addi BBO, BBO, 64 // non-record add: leaves CR0 intact for the branch below
|
||||
|
||||
bge CGEMM_L2_COPYB // repeat while the decremented T1 is >= 0
|
||||
|
||||
|
||||
andi. T1, N, 2
|
||||
ble CGEMM_L2_END
|
||||
mr CO, C
|
||||
|
@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
|
|||
CGEMM_L2x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L2x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
|
|||
|
||||
andi. T1, M, 4
|
||||
ble CGEMM_L2x4_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L2x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
|
|||
|
||||
andi. T1, M, 2
|
||||
ble CGEMM_L2x2_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L2x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
|
|||
|
||||
andi. T1, M, 1
|
||||
ble CGEMM_L2x1_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L2x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -919,6 +1001,39 @@ L999_H2:
|
|||
|
||||
CGEMM_L1_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
slwi T1, K, 1
|
||||
|
||||
// CGEMM_L1_COPYB: expand the data at BO into the scratch area at BBO.
// Each pass loads 32 bytes (eight 32-bit words) from BO and stores 128 bytes
// at BBO: every source word is replicated into all four words of its own
// 16-byte vector. BO advances by 32 and BBO by 2 x 64 per pass.
// Loop control: T1 is decremented by 8 per pass (record form) and the loop
// repeats while the decremented value is >= 0.
// NOTE(review): T1 is set to K << 1 just before this label, so the body
// appears to run floor(K / 4) + 1 times (it still runs once more after T1
// reaches 0) - confirm the amount copied is intended and stays in bounds.
// NOTE(review): o0, o16 and PRE are defined outside this view; o0/o16 are
// presumably byte offsets 0 and 16 - confirm.
CGEMM_L1_COPYB:
|
||||
dcbtst BBO, PRE // store-intent cache hint for the destination, PRE ahead of BBO
|
||||
|
||||
lxvw4x vs3, o0, BO // vs3 = source words 0..3
|
||||
lxvw4x vs11, o16, BO // vs11 = source words 4..7
|
||||
xxspltw vs4, vs3, 0 // vs4 = word 0 of vs3 in all four lanes
|
||||
xxspltw vs5, vs3, 1 // vs5 = word 1 of vs3 in all four lanes
|
||||
xxspltw vs6, vs3, 2 // vs6 = word 2 of vs3 in all four lanes
|
||||
xxspltw vs7, vs3, 3 // vs7 = word 3 of vs3 in all four lanes
|
||||
xxspltw vs12, vs11, 0 // vs12 = word 0 of vs11 in all four lanes
|
||||
xxspltw vs13, vs11, 1 // vs13 = word 1 of vs11 in all four lanes
|
||||
xxspltw vs14, vs11, 2 // vs14 = word 2 of vs11 in all four lanes
|
||||
xxspltw vs15, vs11, 3 // vs15 = word 3 of vs11 in all four lanes
|
||||
stxvw4x vs4, o0, BBO // first 64 output bytes: splats of source words 0..3
|
||||
stxvw4x vs5, o16, BBO
|
||||
||||
stxvw4x vs6, o32, BBO
|
||||
stxvw4x vs7, o48, BBO
|
||||
addi BO, BO, 32 // source consumed: 32 bytes
|
||||
addi BBO, BBO, 64 // move to the second 64-byte half of this pass's output
|
||||
stxvw4x vs12, o0, BBO // second 64 output bytes: splats of source words 4..7
|
||||
stxvw4x vs13, o16, BBO
|
||||
stxvw4x vs14, o32, BBO
|
||||
stxvw4x vs15, o48, BBO
|
||||
addic. T1, T1, -8 // T1 -= 8; record form sets CR0 from the new value
|
||||
addi BBO, BBO, 64 // non-record add: leaves CR0 intact for the branch below
|
||||
|
||||
bge CGEMM_L1_COPYB // repeat while the decremented T1 is >= 0
|
||||
|
||||
|
||||
andi. T1, N, 1
|
||||
ble CGEMM_L1_END
|
||||
mr CO, C
|
||||
|
@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
|
|||
CGEMM_L1x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L1x8_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
|
|||
|
||||
andi. T1, M, 4
|
||||
ble CGEMM_L1x4_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L1x4_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
|
|||
|
||||
andi. T1, M, 2
|
||||
ble CGEMM_L1x2_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L1x2_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
|
|||
|
||||
andi. T1, M, 1
|
||||
ble CGEMM_L1x1_END
|
||||
mr BO, B
|
||||
mr BO, BBUFFER
|
||||
srawi. L, K, 3
|
||||
ble CGEMM_L1x1_SUB0
|
||||
cmpwi cr0, L, 1
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -275,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#include "cgemm_macros_8x4_power8.S"
|
||||
#include "ctrmm_macros_8x4_power8.S"
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble L999_H1
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||
* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
|
@ -83,15 +83,22 @@ CTRMM_L4x8_BEGIN:
|
|||
|
||||
CTRMM_L4x8_LOOP_START:
|
||||
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
LOAD4x8_1
|
||||
KERNEL4x8_I1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
addic. L, L, -2
|
||||
|
@ -102,13 +109,18 @@ CTRMM_L4x8_LOOP_START:
|
|||
CTRMM_L4x8_LOOP:
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
addic. L, L, -1
|
||||
|
@ -117,8 +129,10 @@ CTRMM_L4x8_LOOP:
|
|||
CTRMM_L4x8_LOOP_END:
|
||||
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
KERNEL4x8_1
|
||||
dcbt AO, PRE
|
||||
KERNEL4x8_2
|
||||
|
||||
KERNEL4x8_1
|
||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue