Merge pull request #879 from wernsaar/develop

optimized dgemm and dgetrf for POWER8
This commit is contained in:
Werner Saar 2016-05-17 17:10:36 +02:00
commit 8a149e6294
8 changed files with 97 additions and 62 deletions

View File

@@ -332,6 +332,13 @@ typedef int blasint;
#endif #endif
#endif #endif
#ifdef POWER8
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/* /*
#ifdef PILEDRIVER #ifdef PILEDRIVER
#ifndef YIELDING #ifndef YIELDING

View File

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* LAPACK-TEST : OK * LAPACK-TEST : OK
**************************************************************************************/ **************************************************************************************/
#define MY_ALIGN .align 3
srawi. J, N, 2 srawi. J, N, 2
ble LDGEMM_L4_END ble LDGEMM_L4_END
@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN:
srawi. I, M, 4 srawi. I, M, 4
ble LDGEMM_L4x16_END ble LDGEMM_L4x16_END
.align 4 MY_ALIGN
LDGEMM_L4x16_BEGIN_FIRST: LDGEMM_L4x16_BEGIN_FIRST:
li L, -128 li L, -128
@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST:
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4_FIRST ble LDGEMM_L4x16_SUB4_FIRST
.align 4 MY_ALIGN
LDGEMM_L4x16_LOOP_START_FIRST: LDGEMM_L4x16_LOOP_START_FIRST:
li T2, 512 li T2, 512
@@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST:
ble LDGEMM_L4x16_LOOP_END_FIRST ble LDGEMM_L4x16_LOOP_END_FIRST
mtctr L mtctr L
.align 4 MY_ALIGN
LDGEMM_L4x16_LOOP_FIRST: LDGEMM_L4x16_LOOP_FIRST:
@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST:
bdnz LDGEMM_L4x16_LOOP_FIRST bdnz LDGEMM_L4x16_LOOP_FIRST
.align 4 MY_ALIGN
LDGEMM_L4x16_LOOP_END_FIRST: LDGEMM_L4x16_LOOP_END_FIRST:
@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST:
addic. L, L, -1 addic. L, L, -1
bgt LDGEMM_L4x16_SUB2_FIRST bgt LDGEMM_L4x16_SUB2_FIRST
.align 4 MY_ALIGN
LDGEMM_L4x16_SAVE_FIRST: LDGEMM_L4x16_SAVE_FIRST:
SAVE4x16 SAVE4x16
@@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST:
LDGEMM_L4x16_END_FIRST: LDGEMM_L4x16_END_FIRST:
.align 4 MY_ALIGN
LDGEMM_L4x16_BEGIN: LDGEMM_L4x16_BEGIN:
li L, -128 li L, -128
@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN:
cmpwi cr0, L, 1 cmpwi cr0, L, 1
ble- LDGEMM_L4x16_SUB4 ble- LDGEMM_L4x16_SUB4
.align 4 MY_ALIGN
LDGEMM_L4x16_LOOP_START: LDGEMM_L4x16_LOOP_START:
li o40, 40 li o40, 40
@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START:
ble- LDGEMM_L4x16_LOOP_END ble- LDGEMM_L4x16_LOOP_END
mtctr L mtctr L
.align 4 MY_ALIGN
LDGEMM_L4x16_LOOP: LDGEMM_L4x16_LOOP:
dcbt AO, PRE dcbt AO, PRE
KERNEL4x16_L1 KERNEL4x16_L1
dcbt AO, PRE dcbt AO, PRE
// addic. L, L, -1
KERNEL4x16_L2 KERNEL4x16_L2
bdnz+ LDGEMM_L4x16_LOOP bdnz+ LDGEMM_L4x16_LOOP
.align 4
MY_ALIGN
LDGEMM_L4x16_LOOP_END: LDGEMM_L4x16_LOOP_END:
@@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END:
b LDGEMM_L4x16_SUB1 b LDGEMM_L4x16_SUB1
MY_ALIGN
LDGEMM_L4x16_SUB4: LDGEMM_L4x16_SUB4:
KERNEL4x16_SUBI1 KERNEL4x16_SUBI1
@@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4:
b LDGEMM_L4x16_SUB1 b LDGEMM_L4x16_SUB1
MY_ALIGN
LDGEMM_L4x16_SUB0: LDGEMM_L4x16_SUB0:
andi. L, K, 1 andi. L, K, 1
@@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0:
ble LDGEMM_L4x16_SAVE ble LDGEMM_L4x16_SAVE
b LDGEMM_L4x16_SUB2 b LDGEMM_L4x16_SUB2
MY_ALIGN
LDGEMM_L4x16_SUB1: LDGEMM_L4x16_SUB1:
andi. L, K, 1 andi. L, K, 1
ble LDGEMM_L4x16_SAVE ble LDGEMM_L4x16_SAVE
MY_ALIGN
LDGEMM_L4x16_SUB2: LDGEMM_L4x16_SUB2:
KERNEL4x16_SUB1 KERNEL4x16_SUB1
@@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2:
addic. L, L, -1 addic. L, L, -1
bgt LDGEMM_L4x16_SUB2 bgt LDGEMM_L4x16_SUB2
.align 4 MY_ALIGN
LDGEMM_L4x16_SAVE: LDGEMM_L4x16_SAVE:
SAVE4x16 SAVE4x16
@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L4x8_LOOP_END ble LDGEMM_L4x8_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L4x8_LOOP: LDGEMM_L4x8_LOOP:
@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L4x4_LOOP_END ble LDGEMM_L4x4_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L4x4_LOOP: LDGEMM_L4x4_LOOP:
@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L4x2_LOOP_END ble LDGEMM_L4x2_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L4x2_LOOP: LDGEMM_L4x2_LOOP:
@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L4x1_LOOP_END ble LDGEMM_L4x1_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L4x1_LOOP: LDGEMM_L4x1_LOOP:
@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L2x16_LOOP_END ble LDGEMM_L2x16_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L2x16_LOOP: LDGEMM_L2x16_LOOP:
@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L2x8_LOOP_END ble LDGEMM_L2x8_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L2x8_LOOP: LDGEMM_L2x8_LOOP:
@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L2x4_LOOP_END ble LDGEMM_L2x4_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L2x4_LOOP: LDGEMM_L2x4_LOOP:
@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L2x2_LOOP_END ble LDGEMM_L2x2_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L2x2_LOOP: LDGEMM_L2x2_LOOP:
@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L2x1_LOOP_END ble LDGEMM_L2x1_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L2x1_LOOP: LDGEMM_L2x1_LOOP:
@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L1x16_LOOP_END ble LDGEMM_L1x16_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L1x16_LOOP: LDGEMM_L1x16_LOOP:
@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L1x8_LOOP_END ble LDGEMM_L1x8_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L1x8_LOOP: LDGEMM_L1x8_LOOP:
@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L1x4_LOOP_END ble LDGEMM_L1x4_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L1x4_LOOP: LDGEMM_L1x4_LOOP:
@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L1x2_LOOP_END ble LDGEMM_L1x2_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L1x2_LOOP: LDGEMM_L1x2_LOOP:
@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START:
addic. L, L, -2 addic. L, L, -2
ble LDGEMM_L1x1_LOOP_END ble LDGEMM_L1x1_LOOP_END
.align 5 MY_ALIGN
LDGEMM_L1x1_LOOP: LDGEMM_L1x1_LOOP:

View File

@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs62, vs7, vs15, 3 xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3 xxpermdi vs63, vs23, vs31, 3
dcbt BO, PREB
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO stxvd2x vs33, o16, BO
@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs39, o112, BO stxvd2x vs39, o112, BO
addi BO, BO, 128 addi BO, BO, 128
dcbt BO, PREB
stxvd2x vs40, o0, BO stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO stxvd2x vs42, o32, BO
@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs47, o112, BO stxvd2x vs47, o112, BO
addi BO, BO, 128 addi BO, BO, 128
dcbt BO, PREB
stxvd2x vs48, o0, BO stxvd2x vs48, o0, BO
stxvd2x vs49, o16, BO stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO stxvd2x vs50, o32, BO
@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs55, o112, BO stxvd2x vs55, o112, BO
addi BO, BO, 128 addi BO, BO, 128
dcbt BO, PREB
stxvd2x vs56, o0, BO stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO stxvd2x vs58, o32, BO

View File

@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add B2, B2, B add B2, B2, B
add B1, B1, B add B1, B1, B
li PREA, 256 li PREA, 384
addi PREB, M16, 128 addi PREB, M16, 128
li o8, 8 li o8, 8

View File

@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN:
ble DCOPYT_L4x8_BEGIN ble DCOPYT_L4x8_BEGIN
mr BO, B16 mr BO, B16
addi T2, M16, 384
mtctr J
.align 5 .align 5
DCOPYT_L4x16_LOOP: DCOPYT_L4x16_LOOP:
/* addi T1, M16, 256
addi T1, PREB, 128
addi T2, PREB, 256
*/
dcbt A0, PREA dcbt A0, PREA
dcbt A1, PREA dcbt A1, PREA
dcbt A2, PREA dcbt A2, PREA
dcbt A3, PREA dcbt A3, PREA
/*
dcbtst BO, M16 dcbt BO, M16
dcbtst BO, PREB dcbt BO, PREB
dcbtst BO, T1 dcbt BO, T1
dcbtst BO, T2 dcbt BO, T2
*/
COPY_4x16 COPY_4x16
add BO, BO, M16 add BO, BO, M16
addic. J, J, -1 // addic. J, J, -1
bgt DCOPYT_L4x16_LOOP bdnz+ DCOPYT_L4x16_LOOP
DCOPYT_L4x8_BEGIN: DCOPYT_L4x8_BEGIN:

View File

@@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs35, o48, A0 lxvd2x vs35, o48, A0
addi A0, A0, 64 addi A0, A0, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
lxvd2x vs40, o0, A1 lxvd2x vs40, o0, A1
lxvd2x vs41, o16, A1 lxvd2x vs41, o16, A1
lxvd2x vs42, o32, A1 lxvd2x vs42, o32, A1
lxvd2x vs43, o48, A1 lxvd2x vs43, o48, A1
addi A1, A1, 64 addi A1, A1, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
lxvd2x vs48, o0, A2 lxvd2x vs48, o0, A2
lxvd2x vs49, o16, A2 lxvd2x vs49, o16, A2
lxvd2x vs50, o32, A2 lxvd2x vs50, o32, A2
lxvd2x vs51, o48, A2 lxvd2x vs51, o48, A2
addi A2, A2, 64 addi A2, A2, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
addi A2, A2, 64
lxvd2x vs56, o0, A3 lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3 lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3 lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3 lxvd2x vs59, o48, A3
addi A3, A3, 64 addi A3, A3, 64
lxvd2x vs36, o0, A0
lxvd2x vs37, o16, A0
lxvd2x vs38, o32, A0
lxvd2x vs39, o48, A0
addi A0, A0, 64
lxvd2x vs44, o0, A1
lxvd2x vs45, o16, A1
lxvd2x vs46, o32, A1
lxvd2x vs47, o48, A1
addi A1, A1, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
addi A2, A2, 64
lxvd2x vs60, o0, A3 lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3 lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3 lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3 lxvd2x vs63, o48, A3
addi A3, A3, 64 addi A3, A3, 64
mr T1, BO mr T1, BO
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1

View File

@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q; if (blocking > GEMM_Q) blocking = GEMM_Q;
if (blocking <= GEMM_UNROLL_N * 2) { #ifdef POWER8
if (blocking <= GEMM_UNROLL_N) {
info = GETF2(args, NULL, range_n, sa, sb, 0); info = GETF2(args, NULL, range_n, sa, sb, 0);
return info; return info;
} }
#else
if (blocking <= GEMM_UNROLL_N*2) {
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
#endif
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

View File

@@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
if (blocking > GEMM_Q) blocking = GEMM_Q; if (blocking > GEMM_Q) blocking = GEMM_Q;
#ifdef POWER8
if (blocking <= GEMM_UNROLL_N) {
info = GETF2(args, NULL, range_n, sa, sb, 0);
return info;
}
#else
if (blocking <= GEMM_UNROLL_N * 2) { if (blocking <= GEMM_UNROLL_N * 2) {
info = GETF2(args, NULL, range_n, sa, sb, 0); info = GETF2(args, NULL, range_n, sa, sb, 0);
return info; return info;
} }
#endif
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);