Improved zgemm for power9 based on power8

AbdelRauf 2019-05-23 04:23:43 +00:00
parent 47f892198c
commit 8fe794f059
7 changed files with 2802 additions and 24 deletions

View File

@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

View File

@@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
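
(Commentary, not part of the commit: the pair of instructions above narrows the double-precision alpha held in f1/vs1 to single precision and then replicates it across all four word lanes of the vector register, exactly as the inline comment says. A minimal C model of that convert-and-splat, using GCC vector extensions and hypothetical names:)

/* Illustrative model of the xscvdpspn + xxspltw sequence above (not OpenBLAS code). */
typedef float v4sf __attribute__((vector_size(16)));

static inline v4sf splat_alpha(double alpha)
{
    float a = (float)alpha;        /* xscvdpspn: double -> single         */
    return (v4sf){a, a, a, a};     /* xxspltw:   broadcast to all 4 lanes */
}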

View File

@@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN:
LSGEMM_L8x16_LOOP_START:
LOAD8x16_0 /*we already zeroed */
##OffsetA=64 OffsetB=32
addi AO,AO,2112
addi BO,BO,32
/*##OffsetA=64 OffsetB=32
#addi AO,AO,2112
#addi BO,BO,32 */
mtctr L
@@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START:
LSGEMM_L8x16_LOOP:
KERNEL8x16_I1_L4_2 -2048,0, 0,0
KERNEL8x16_I1_L4_2 -2048,0, 1,0
KERNEL8x16_I1_L4_2 -2048,0, 2,0
KERNEL8x16_I1_L4_2 -2048,0, 3,0
KERNEL8x16_I1_L4_2 -2048,0, 4,0
KERNEL8x16_I1_L4_2 -2048,0, 5,0
KERNEL8x16_I1_L4_2 -2048,0, 6,0
KERNEL8x16_I1_L4_2 -2048,0, 7,0
KERNEL8x16_I1_L4_2 -2048,0, 8,0
KERNEL8x16_I1_L4_2 -2048,0, 9,0
KERNEL8x16_I1_L4_2 -2048,0, 10,0
KERNEL8x16_I1_L4_2 -2048,0, 11,0
KERNEL8x16_I1_L4_2 -2048,0, 12,0
KERNEL8x16_I1_L4_2 -2048,0, 13,0
KERNEL8x16_I1_L4_2 -2048,0, 14,0
KERNEL8x16_I1_L4_2 -2048,0, 15,1
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_2 64,32, 15,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16 0, AO, BO, -2048, 0
END8x16 0, AO, BO, 64, 32
b LSGEMM_L8x16_SUB1
MY_ALIGN
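
(Commentary, not part of the commit: per the removed ##OffsetA=64 OffsetB=32 comment, the rewritten loop no longer pre-biases AO by 2112 bytes and indexes backwards from -2048; each KERNEL8x16_I1_L4_2 / END8x16 invocation is now passed the per-call advances of 64 bytes of A and 32 bytes of B directly. A hypothetical C sketch of that bookkeeping, with names of my own choosing:)

/* Hypothetical sketch of the pointer bookkeeping used above (not the real macro). */
static inline void advance_ab(const float **ao, const float **bo)
{
    *ao += 64 / sizeof(float);   /* OffsetA = 64 bytes of A consumed per call */
    *bo += 32 / sizeof(float);   /* OffsetB = 32 bytes of B consumed per call */
}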

View File

@@ -0,0 +1,257 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE 32192
#define FZERO 312+192(SP)
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define VECSAVE r11
#define FRAMEPOINTER r12
#define BBUFFER r14
#define L r15
#define ALPHA r16
#define T5 r17
#define T2 r19
#define BBO r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define T3 r28
#define T4 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
mr FRAMEPOINTER, SP
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv v20, 288(SP)
stxv v21, 304(SP)
stxv v22, 320(SP)
stxv v23, 336(SP)
stxv v24, 352(SP)
stxv v25, 368(SP)
stxv v26, 384(SP)
stxv v27, 400(SP)
stxv v28, 416(SP)
stxv v29, 432(SP)
stxv v30, 448(SP)
stxv v31, 464(SP)
stw r0, FZERO
#ifdef linux
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
#include "zgemm_macros_power9.S"
cmpwi cr0, M, 0
ble L999
cmpwi cr0, N, 0
ble L999
cmpwi cr0, K, 0
ble L999
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 512
li o8 , 8
li o16 , 16
addi BBUFFER, SP, 512+4096
li T1, -4096
and BBUFFER, BBUFFER, T1
addi ALPHA, SP, 296+192
xxlor alpha_r,vs1,vs1 /*copy from register f1 */
xxlor alpha_i,vs2,vs2 /*copy from register f2 */
.align 4
#include "zgemm_logic_power9.S"
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
lxv v20, 288(SP)
lxv v21, 304(SP)
lxv v22, 320(SP)
lxv v23, 336(SP)
lxv v24, 352(SP)
lxv v25, 368(SP)
lxv v26, 384(SP)
lxv v27, 400(SP)
lxv v28, 416(SP)
lxv v29, 432(SP)
lxv v30, 448(SP)
lxv v31, 464(SP)
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
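
(Commentary, not part of the file: one detail in the prologue above worth spelling out is that BBUFFER is carved out of the enlarged stack frame and rounded down to a 4 KiB boundary by ANDing with -4096. A C equivalent of that alignment trick, under the assumption that SP points at the bottom of the frame:)

#include <stdint.h>

/* C equivalent of "addi BBUFFER, SP, 512+4096 ; li T1, -4096 ;
 * and BBUFFER, BBUFFER, T1": step past the low part of the frame,
 * then clear the low 12 bits to land on a 4 KiB boundary. */
static inline void *bbuffer_from_sp(void *sp)
{
    uintptr_t p = (uintptr_t)sp + 512 + 4096;
    return (void *)(p & ~(uintptr_t)4095);
}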

View File

@@ -0,0 +1,857 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define MY_ALIGN .align 3
srawi. J, N, 1
ble ZGEMM_L2_END
ZGEMM_L2_BEGIN:
mr BO, B
mr BBO, BBUFFER
srawi. T1, K, 2
ble ZGEMM_L2_COPYB1
ZGEMM_L2_COPYB8:
addi T2, PRE, 128
dcbt BO, PRE
dcbtst BBO, PRE
dcbtst BBO, T2
ZCOPYB_8
addic. T1, T1, -1
bgt ZGEMM_L2_COPYB8
ZGEMM_L2_COPYB1:
andi. T1, K, 3
ble ZGEMM_L2_COPYB_END
ZGEMM_L2_COPYB_LOOP:
ZCOPYB_2
addic. T1, T1, -1
bgt ZGEMM_L2_COPYB_LOOP
ZGEMM_L2_COPYB_END:
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 3
ble ZGEMM_L2x8_END
ZGEMM_L2x8_BEGIN:
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 32x */
ZERO2x8
ble ZGEMM_L2x8_SUB0
ZGEMM_L2x8_LOOP_START:
LOAD2x8 0
li T2, 1024
li T3, 1024+512
li T4, 2048
li T5, 2048+512
mtctr L
MY_ALIGN
ZGEMM_L2x8_LOOP:
dcbt AO, PRE
dcbt BO, PRE
KERNEL2x8_L 128,64,0,0
KERNEL2x8_L 128,64,1,0
dcbt AO, T2
KERNEL2x8_L 128,64,2,0
KERNEL2x8_L 128,64,3,0
dcbt AO, T3
dcbt BO, T2
KERNEL2x8_L 128,64,4,0
KERNEL2x8_L 128,64,5,0
dcbt AO, T4
KERNEL2x8_L 128,64,6,0
KERNEL2x8_L 128,64,7,0
dcbt AO, T5
dcbt BO, T3
KERNEL2x8_L 128,64,8,0
KERNEL2x8_L 128,64,9,0
KERNEL2x8_L 128,64,10,0
KERNEL2x8_L 128,64,11,0
dcbt BO, T4
KERNEL2x8_L 128,64,12,0
KERNEL2x8_L 128,64,13,0
KERNEL2x8_L 128,64,14,0
KERNEL2x8_L 128,64,15,1
bdnz ZGEMM_L2x8_LOOP
MY_ALIGN
ZGEMM_L2x8_LOOP_END:
END2x8 AO, BO, 128, 64
b ZGEMM_L2x8_SUB1
ZGEMM_L2x8_SUB0:
andi. L, K, 63
b ZGEMM_L2x8_SUB2
ZGEMM_L2x8_SUB1:
andi. L, T1, 31
ble ZGEMM_L2x8_SAVE
ZGEMM_L2x8_SUB2:
srawi. T1,L, 3
ble ZGEMM_L2x8_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L2x8_SUB2_LOOP:
LOAD2x8 0
KERNEL2x8_L 128,64, 0,0
KERNEL2x8_L 128,64, 1,0
KERNEL2x8_L 128,64, 2,0
KERNEL2x8_E 128,64, 3,1
bdnz ZGEMM_L2x8_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x8_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x8_SUB2_2
LOAD2x8 0
KERNEL2x8_L 128,64, 0,0
KERNEL2x8_E 128,64, 1,1
MY_ALIGN
ZGEMM_L2x8_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x8_SUB2_1
LOAD2x8 0
KERNEL2x8_E 128,64, 0,1
MY_ALIGN
ZGEMM_L2x8_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L2x8_SAVE
KERNEL2x8
/* addic. L, L, -1
bgt ZGEMM_L2x8_SUB2_1*/
ZGEMM_L2x8_SAVE:
SAVE2x8
addic. I, I, -1
bgt ZGEMM_L2x8_BEGIN
ZGEMM_L2x8_END:
ZGEMM_L2x4_BEGIN:
andi. T2, M, 7
ble ZGEMM_L2x1_END
andi. T1, M, 4
ble ZGEMM_L2x4_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 4 /**(K-1) % 16x */
ZERO2x4
ble ZGEMM_L2x4_SUB0
ZGEMM_L2x4_LOOP_START:
LOAD2x4 0
mtctr L
MY_ALIGN
ZGEMM_L2x4_LOOP:
KERNEL2x4_L 64,64,0,0
KERNEL2x4_L 64,64,1,0
KERNEL2x4_L 64,64,2,0
KERNEL2x4_L 64,64,3,0
KERNEL2x4_L 64,64,4,0
KERNEL2x4_L 64,64,5,0
KERNEL2x4_L 64,64,6,0
KERNEL2x4_L 64,64,7,1
bdnz ZGEMM_L2x4_LOOP
MY_ALIGN
ZGEMM_L2x4_LOOP_END:
END2x4 AO, BO, 64, 64
b ZGEMM_L2x4_SUB1
ZGEMM_L2x4_SUB0:
andi. L, K, 31
b ZGEMM_L2x4_SUB2
ZGEMM_L2x4_SUB1:
andi. L, T1, 15
ble ZGEMM_L2x4_SAVE
ZGEMM_L2x4_SUB2:
srawi. T1,L, 3
ble ZGEMM_L2x4_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L2x4_SUB2_LOOP:
LOAD2x4 0
KERNEL2x4_L 64,64, 0,0
KERNEL2x4_L 64,64, 1,0
KERNEL2x4_L 64,64, 2,0
KERNEL2x4_E 64,64, 3,1
bdnz ZGEMM_L2x4_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x4_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x4_SUB2_2
LOAD2x4 0
KERNEL2x4_L 64,64, 0,0
KERNEL2x4_E 64,64, 1,1
MY_ALIGN
ZGEMM_L2x4_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x4_SUB2_1
LOAD2x4 0
KERNEL2x4_E 64,64, 0,1
MY_ALIGN
ZGEMM_L2x4_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L2x4_SAVE
KERNEL2x4
ZGEMM_L2x4_SAVE:
SAVE2x4
ZGEMM_L2x4_END:
ZGEMM_L2x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L2x2_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 4 /**(K-1) % 16x */
ZERO2x2
ble ZGEMM_L2x2_SUB0
ZGEMM_L2x2_LOOP_START:
LOAD2x2 0
mtctr L
MY_ALIGN
ZGEMM_L2x2_LOOP:
KERNEL2x2_L 32,64,0,0
KERNEL2x2_L 32,64,1,0
KERNEL2x2_L 32,64,2,0
KERNEL2x2_L 32,64,3,0
KERNEL2x2_L 32,64,4,0
KERNEL2x2_L 32,64,5,0
KERNEL2x2_L 32,64,6,0
KERNEL2x2_L 32,64,7,1
bdnz ZGEMM_L2x2_LOOP
MY_ALIGN
ZGEMM_L2x2_LOOP_END:
END2x2 AO, BO, 32, 64
b ZGEMM_L2x2_SUB1
ZGEMM_L2x2_SUB0:
andi. L, K, 31
b ZGEMM_L2x2_SUB2
ZGEMM_L2x2_SUB1:
andi. L, T1, 15
ble ZGEMM_L2x2_SAVE
ZGEMM_L2x2_SUB2:
srawi. T1,L, 3
ble ZGEMM_L2x2_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L2x2_SUB2_LOOP:
LOAD2x2 0
KERNEL2x2_L 32,64, 0,0
KERNEL2x2_L 32,64, 1,0
KERNEL2x2_L 32,64, 2,0
KERNEL2x2_E 32,64, 3,1
bdnz ZGEMM_L2x2_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x2_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x2_SUB2_2
LOAD2x2 0
KERNEL2x2_L 32,64, 0,0
KERNEL2x2_E 32,64, 1,1
MY_ALIGN
ZGEMM_L2x2_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x2_SUB2_1
LOAD2x2 0
KERNEL2x2_E 32,64, 0,1
MY_ALIGN
ZGEMM_L2x2_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L2x2_SAVE
KERNEL2x2
ZGEMM_L2x2_SAVE:
SAVE2x2
ZGEMM_L2x2_END:
ZGEMM_L2x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L2x1_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 4 /**(K-1) % 16x */
ZERO2x1
ble ZGEMM_L2x1_SUB0
ZGEMM_L2x1_LOOP_START:
LOAD2x1 0
mtctr L
MY_ALIGN
ZGEMM_L2x1_LOOP:
KERNEL2x1_L 16,64,0,0
KERNEL2x1_L 16,64,1,0
KERNEL2x1_L 16,64,2,0
KERNEL2x1_L 16,64,3,0
KERNEL2x1_L 16,64,4,0
KERNEL2x1_L 16,64,5,0
KERNEL2x1_L 16,64,6,0
KERNEL2x1_L 16,64,7,1
bdnz ZGEMM_L2x1_LOOP
MY_ALIGN
ZGEMM_L2x1_LOOP_END:
END2x1 AO, BO, 16, 64
b ZGEMM_L2x1_SUB1
ZGEMM_L2x1_SUB0:
andi. L, K, 31
b ZGEMM_L2x1_SUB2
ZGEMM_L2x1_SUB1:
andi. L, T1, 15
ble ZGEMM_L2x1_SAVE
ZGEMM_L2x1_SUB2:
srawi. T1,L, 3
ble ZGEMM_L2x1_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L2x1_SUB2_LOOP:
LOAD2x1 0
KERNEL2x1_L 16,64, 0,0
KERNEL2x1_L 16,64, 1,0
KERNEL2x1_L 16,64, 2,0
KERNEL2x1_E 16,64, 3,1
bdnz ZGEMM_L2x1_SUB2_LOOP
MY_ALIGN
ZGEMM_L2x1_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L2x1_SUB2_2
LOAD2x1 0
KERNEL2x1_L 16,64, 0,0
KERNEL2x1_E 16,64, 1,1
MY_ALIGN
ZGEMM_L2x1_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L2x1_SUB2_1
LOAD2x1 0
KERNEL2x1_E 16,64, 0,1
MY_ALIGN
ZGEMM_L2x1_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L2x1_SAVE
KERNEL2x1
ZGEMM_L2x1_SAVE:
SAVE2x1
ZGEMM_L2x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
bgt ZGEMM_L2_BEGIN
andi. T2, N, 1
ble L999
ZGEMM_L2_END:
b ZGEMM_L1_BEGIN
L999_H1:
b L999
ZGEMM_L1_BEGIN:
andi. T1, N, 1
ble ZGEMM_L1_END
mr BO, B
mr BBO, BBUFFER
srawi. T1, K, 3 /*this time K/8 */
ble ZGEMM_L1_COPYB1
ZGEMM_L1_COPYB8:
addi T2, PRE, 128
dcbt BO, PRE
dcbtst BBO, PRE
dcbtst BBO, T2
ZCOPYB_8
addic. T1, T1, -1
bgt ZGEMM_L1_COPYB8
ZGEMM_L1_COPYB1:
andi. T1, K, 7
ble ZGEMM_L1_COPYB_END
ZGEMM_L1_COPYB_LOOP:
ZCOPYB_1
addic. T1, T1, -1
bgt ZGEMM_L1_COPYB_LOOP
ZGEMM_L1_COPYB_END:
mr CO, C
mr AO, A
srawi. I, M, 3
ble ZGEMM_L1x8_END
ZGEMM_L1x8_BEGIN:
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 32x */
ZERO1x8
ble ZGEMM_L1x8_SUB0
ZGEMM_L1x8_LOOP_START:
LOAD1x8 0
li T2, 1024
li T3, 1024+512
li T4, 2048
li T5, 2048+512
mtctr L
MY_ALIGN
ZGEMM_L1x8_LOOP:
dcbt AO, PRE
dcbt BO, PRE
KERNEL1x8_L 128,32,0,0
KERNEL1x8_L 128,32,1,0
dcbt AO, T2
KERNEL1x8_L 128,32,2,0
KERNEL1x8_L 128,32,3,0
dcbt AO, T3
dcbt BO, T2
KERNEL1x8_L 128,32,4,0
KERNEL1x8_L 128,32,5,0
dcbt AO, T4
KERNEL1x8_L 128,32,6,0
KERNEL1x8_L 128,32,7,0
dcbt AO, T5
dcbt BO, T3
KERNEL1x8_L 128,32,8,0
KERNEL1x8_L 128,32,9,0
KERNEL1x8_L 128,32,10,0
KERNEL1x8_L 128,32,11,0
dcbt BO, T4
KERNEL1x8_L 128,32,12,0
KERNEL1x8_L 128,32,13,0
KERNEL1x8_L 128,32,14,0
KERNEL1x8_L 128,32,15,1
bdnz ZGEMM_L1x8_LOOP
MY_ALIGN
ZGEMM_L1x8_LOOP_END:
END1x8 AO, BO, 128, 32
b ZGEMM_L1x8_SUB1
ZGEMM_L1x8_SUB0:
andi. L, K, 63
b ZGEMM_L1x8_SUB2
ZGEMM_L1x8_SUB1:
andi. L, T1, 31
ble ZGEMM_L1x8_SAVE
ZGEMM_L1x8_SUB2:
srawi. T1,L, 3
ble ZGEMM_L1x8_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L1x8_SUB2_LOOP:
LOAD1x8 0
KERNEL1x8_L 128,32, 0,0
KERNEL1x8_L 128,32, 1,0
KERNEL1x8_L 128,32, 2,0
KERNEL1x8_E 128,32, 3,1
bdnz ZGEMM_L1x8_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x8_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x8_SUB2_2
LOAD1x8 0
KERNEL1x8_L 128,32, 0,0
KERNEL1x8_E 128,32, 1,1
MY_ALIGN
ZGEMM_L1x8_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x8_SUB2_1
LOAD1x8 0
KERNEL1x8_E 128,32, 0,1
MY_ALIGN
ZGEMM_L1x8_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L1x8_SAVE
KERNEL1x8
/* addic. L, L, -1
bgt ZGEMM_L1x8_SUB2_1*/
ZGEMM_L1x8_SAVE:
SAVE1x8
addic. I, I, -1
bgt ZGEMM_L1x8_BEGIN
ZGEMM_L1x8_END:
ZGEMM_L1x4_BEGIN:
andi. T2, M, 7
ble ZGEMM_L1x1_END
andi. T1, M, 4
ble ZGEMM_L1x4_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
ZERO1x4
ble ZGEMM_L1x4_SUB0
ZGEMM_L1x4_LOOP_START:
LOAD1x4 0
mtctr L
MY_ALIGN
ZGEMM_L1x4_LOOP:
KERNEL1x4_L 64,32,0,0
KERNEL1x4_L 64,32,1,0
KERNEL1x4_L 64,32,2,0
KERNEL1x4_L 64,32,3,0
KERNEL1x4_L 64,32,4,0
KERNEL1x4_L 64,32,5,0
KERNEL1x4_L 64,32,6,0
KERNEL1x4_L 64,32,7,0
KERNEL1x4_L 64,32,8,0
KERNEL1x4_L 64,32,9,0
KERNEL1x4_L 64,32,10,0
KERNEL1x4_L 64,32,11,0
KERNEL1x4_L 64,32,12,0
KERNEL1x4_L 64,32,13,0
KERNEL1x4_L 64,32,14,0
KERNEL1x4_L 64,32,15,1
bdnz ZGEMM_L1x4_LOOP
MY_ALIGN
ZGEMM_L1x4_LOOP_END:
END1x4 AO, BO, 64, 32
b ZGEMM_L1x4_SUB1
ZGEMM_L1x4_SUB0:
andi. L, K, 63
b ZGEMM_L1x4_SUB2
ZGEMM_L1x4_SUB1:
andi. L, T1, 31
ble ZGEMM_L1x4_SAVE
ZGEMM_L1x4_SUB2:
srawi. T1,L, 3
ble ZGEMM_L1x4_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L1x4_SUB2_LOOP:
LOAD1x4 0
KERNEL1x4_L 64,32, 0,0
KERNEL1x4_L 64,32, 1,0
KERNEL1x4_L 64,32, 2,0
KERNEL1x4_E 64,32, 3,1
bdnz ZGEMM_L1x4_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x4_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x4_SUB2_2
LOAD1x4 0
KERNEL1x4_L 64,32, 0,0
KERNEL1x4_E 64,32, 1,1
MY_ALIGN
ZGEMM_L1x4_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x4_SUB2_1
LOAD1x4 0
KERNEL1x4_E 64,32, 0,1
MY_ALIGN
ZGEMM_L1x4_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L1x4_SAVE
KERNEL1x4
ZGEMM_L1x4_SAVE:
SAVE1x4
ZGEMM_L1x4_END:
ZGEMM_L1x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L1x2_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
ZERO1x2
ble ZGEMM_L1x2_SUB0
ZGEMM_L1x2_LOOP_START:
LOAD1x2 0
mtctr L
MY_ALIGN
ZGEMM_L1x2_LOOP:
KERNEL1x2_L 32,32,0,0
KERNEL1x2_L 32,32,1,0
KERNEL1x2_L 32,32,2,0
KERNEL1x2_L 32,32,3,0
KERNEL1x2_L 32,32,4,0
KERNEL1x2_L 32,32,5,0
KERNEL1x2_L 32,32,6,0
KERNEL1x2_L 32,32,7,0
KERNEL1x2_L 32,32,8,0
KERNEL1x2_L 32,32,9,0
KERNEL1x2_L 32,32,10,0
KERNEL1x2_L 32,32,11,0
KERNEL1x2_L 32,32,12,0
KERNEL1x2_L 32,32,13,0
KERNEL1x2_L 32,32,14,0
KERNEL1x2_L 32,32,15,1
bdnz ZGEMM_L1x2_LOOP
MY_ALIGN
ZGEMM_L1x2_LOOP_END:
END1x2 AO, BO, 32, 32
b ZGEMM_L1x2_SUB1
ZGEMM_L1x2_SUB0:
andi. L, K, 63
b ZGEMM_L1x2_SUB2
ZGEMM_L1x2_SUB1:
andi. L, T1, 31
ble ZGEMM_L1x2_SAVE
ZGEMM_L1x2_SUB2:
srawi. T1,L, 3
ble ZGEMM_L1x2_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L1x2_SUB2_LOOP:
LOAD1x2 0
KERNEL1x2_L 32,32, 0,0
KERNEL1x2_L 32,32, 1,0
KERNEL1x2_L 32,32, 2,0
KERNEL1x2_E 32,32, 3,1
bdnz ZGEMM_L1x2_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x2_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x2_SUB2_2
LOAD1x2 0
KERNEL1x2_L 32,32, 0,0
KERNEL1x2_E 32,32, 1,1
MY_ALIGN
ZGEMM_L1x2_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x2_SUB2_1
LOAD1x2 0
KERNEL1x2_E 32,32, 0,1
MY_ALIGN
ZGEMM_L1x2_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L1x2_SAVE
KERNEL1x2
ZGEMM_L1x2_SAVE:
SAVE1x2
ZGEMM_L1x2_END:
ZGEMM_L1x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L1x1_END
mr BO, BBUFFER
mr T1, K
addi T1,T1, -1
srawi. L, T1, 5 /**(K-1) % 16x */
ZERO1x1
ble ZGEMM_L1x1_SUB0
ZGEMM_L1x1_LOOP_START:
LOAD1x1 0
mtctr L
MY_ALIGN
ZGEMM_L1x1_LOOP:
KERNEL1x1_L 16,32,0,0
KERNEL1x1_L 16,32,1,0
KERNEL1x1_L 16,32,2,0
KERNEL1x1_L 16,32,3,0
KERNEL1x1_L 16,32,4,0
KERNEL1x1_L 16,32,5,0
KERNEL1x1_L 16,32,6,0
KERNEL1x1_L 16,32,7,0
KERNEL1x1_L 16,32,8,0
KERNEL1x1_L 16,32,9,0
KERNEL1x1_L 16,32,10,0
KERNEL1x1_L 16,32,11,0
KERNEL1x1_L 16,32,12,0
KERNEL1x1_L 16,32,13,0
KERNEL1x1_L 16,32,14,0
KERNEL1x1_L 16,32,15,1
bdnz ZGEMM_L1x1_LOOP
MY_ALIGN
ZGEMM_L1x1_LOOP_END:
END1x1 AO, BO, 16, 32
b ZGEMM_L1x1_SUB1
ZGEMM_L1x1_SUB0:
andi. L, K, 63
b ZGEMM_L1x1_SUB2
ZGEMM_L1x1_SUB1:
andi. L, T1, 31
ble ZGEMM_L1x1_SAVE
ZGEMM_L1x1_SUB2:
srawi. T1,L, 3
ble ZGEMM_L1x1_SUB2_4
mtctr T1
MY_ALIGN
ZGEMM_L1x1_SUB2_LOOP:
LOAD1x1 0
KERNEL1x1_L 16,32, 0,0
KERNEL1x1_L 16,32, 1,0
KERNEL1x1_L 16,32, 2,0
KERNEL1x1_E 16,32, 3,1
bdnz ZGEMM_L1x1_SUB2_LOOP
MY_ALIGN
ZGEMM_L1x1_SUB2_4:
andi. T1,L, 4
ble ZGEMM_L1x1_SUB2_2
LOAD1x1 0
KERNEL1x1_L 16,32, 0,0
KERNEL1x1_E 16,32, 1,1
MY_ALIGN
ZGEMM_L1x1_SUB2_2:
andi. T1,L, 2
ble ZGEMM_L1x1_SUB2_1
LOAD1x1 0
KERNEL1x1_E 16,32, 0,1
MY_ALIGN
ZGEMM_L1x1_SUB2_1:
andi. T1,L, 1
ble ZGEMM_L1x1_SAVE
KERNEL1x1
ZGEMM_L1x1_SAVE:
SAVE1x1
ZGEMM_L1x1_END:
ZGEMM_L1_END:
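
(Commentary, not part of the file: the label structure above is a register-blocked driver. Columns of B are taken two at a time (ZGEMM_L2_*), a leftover single column is handled by ZGEMM_L1_*, and within each column block the rows of A are processed in blocks of 8, 4, 2 and 1, with a 32x/16x-unrolled K loop plus tail handling inside each tile. The following rough, purely illustrative C model mirrors that loop order; the function name is mine and each tile is computed naively rather than with VSX kernels:)

#include <complex.h>
#include <stddef.h>

/* Illustrative model of the blocking order above: C += alpha * A * B,
 * column-major, no TRMM/offset handling, no packing buffer. */
static void zgemm_driver_model(size_t m, size_t n, size_t k,
                               double complex alpha,
                               const double complex *A, size_t lda,
                               const double complex *B, size_t ldb,
                               double complex *C, size_t ldc)
{
    static const size_t mblk[] = {8, 4, 2, 1};
    for (size_t j = 0; j < n; ) {
        size_t nb = (n - j >= 2) ? 2 : 1;               /* ZGEMM_L2_* / ZGEMM_L1_* */
        for (size_t i = 0; i < m; ) {
            size_t mb = 1;
            for (int t = 0; t < 4; t++)                 /* L?x8, L?x4, L?x2, L?x1  */
                if (m - i >= mblk[t]) { mb = mblk[t]; break; }
            for (size_t jj = 0; jj < nb; jj++)          /* one mb x nb tile        */
                for (size_t ii = 0; ii < mb; ii++) {
                    double complex acc = 0.0;
                    for (size_t l = 0; l < k; l++)      /* the KERNEL*_L K loop    */
                        acc += A[(i + ii) + l * lda] * B[l + (j + jj) * ldb];
                    C[(i + ii) + (j + jj) * ldc] += alpha * acc;
                }
            i += mb;
        }
        j += nb;
    }
}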

File diff suppressed because it is too large.

View File

@@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 640
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define ZGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_Q 1408
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 1152
#define SYMV_P 8
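
(Commentary, not part of the change: assuming the usual OpenBLAS convention that GEMM_P blocks the M dimension and GEMM_Q blocks the K dimension of the packed A panel, the new defaults grow that panel substantially. A quick back-of-the-envelope check:)

#include <stdio.h>

/* Footprint of one packed A panel under the old and new ZGEMM defaults,
 * assuming P x Q double-complex elements of 16 bytes each. */
int main(void)
{
    const double elem = 16.0;                           /* sizeof(double complex) */
    printf("old: %.1f MiB\n", 320 * 640  * elem / (1024 * 1024));   /* ~3.1 MiB  */
    printf("new: %.1f MiB\n", 512 * 1152 * elem / (1024 * 1024));   /* ~9.0 MiB  */
    return 0;
}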