improved zgemm power9 based on power8
This commit is contained in:
parent
47f892198c
commit
8fe794f059
|
@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
|||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
|
|
|
@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
/*alpha is stored in f1. convert to single and splat*/
|
||||
xscvdpspn alpha_r,vs1
|
||||
xscvdpspn alpha_r,vs1
|
||||
xxspltw alpha_r,alpha_r,0
|
||||
|
||||
|
||||
|
|
|
@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN:
|
|||
LSGEMM_L8x16_LOOP_START:
|
||||
|
||||
LOAD8x16_0 /*we already zeroed */
|
||||
##OffsetA=64 OffsetB=32
|
||||
addi AO,AO,2112
|
||||
addi BO,BO,32
|
||||
/*##OffsetA=64 OffsetB=32
|
||||
#addi AO,AO,2112
|
||||
#addi BO,BO,32 */
|
||||
|
||||
mtctr L
|
||||
|
||||
|
@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START:
|
|||
|
||||
LSGEMM_L8x16_LOOP:
|
||||
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 0,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 1,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 2,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 3,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 4,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 5,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 6,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 7,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 8,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 9,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 10,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 11,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 12,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 13,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 14,0
|
||||
KERNEL8x16_I1_L4_2 -2048,0, 15,1
|
||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 7,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 8,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 9,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 10,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 11,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 12,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 13,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 14,0
|
||||
KERNEL8x16_I1_L4_2 64,32, 15,1
|
||||
|
||||
bdnz LSGEMM_L8x16_LOOP
|
||||
|
||||
MY_ALIGN
|
||||
LSGEMM_L8x16_LOOP_END:
|
||||
|
||||
END8x16 0, AO, BO, -2048, 0
|
||||
END8x16 0, AO, BO, 64, 32
|
||||
|
||||
b LSGEMM_L8x16_SUB1
|
||||
MY_ALIGN
|
||||
|
|
|
@ -0,0 +1,257 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
#define LOAD ld
|
||||
|
||||
#define STACKSIZE 32192
|
||||
|
||||
#define FZERO 312+192(SP)
|
||||
|
||||
|
||||
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
|
||||
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
|
||||
|
||||
|
||||
#define o0 0
|
||||
#define alpha_r vs30
|
||||
#define alpha_i vs31
|
||||
|
||||
#define VECSAVE r11
|
||||
|
||||
#define FRAMEPOINTER r12
|
||||
|
||||
#define BBUFFER r14
|
||||
|
||||
#define L r15
|
||||
#define ALPHA r16
|
||||
#define T5 r17
|
||||
#define T2 r19
|
||||
#define BBO r20
|
||||
#define o8 r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define o16 r27
|
||||
#define T3 r28
|
||||
#define T4 r29
|
||||
|
||||
#define PRE r30
|
||||
#define T1 r31
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
mr FRAMEPOINTER, SP /* keep the entry SP; stack-passed arguments (LDC, OFFSET) are loaded relative to it further down */
|
||||
addi SP, SP, -STACKSIZE /* frame is 4*STACKSIZE bytes, allocated in four steps */
|
||||
addi SP, SP, -STACKSIZE /* NOTE(review): presumably split because addi takes a signed 16-bit immediate and 4*32192 does not fit - confirm */
|
||||
addi SP, SP, -STACKSIZE
|
||||
addi SP, SP, -STACKSIZE /* the epilogue at L999 undoes this with four matching +STACKSIZE steps */
|
||||
li r0, 0 /* r0 = 0; stored to FZERO after the register saves */
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv v20, 288(SP)
|
||||
stxv v21, 304(SP)
|
||||
stxv v22, 320(SP)
|
||||
stxv v23, 336(SP)
|
||||
stxv v24, 352(SP)
|
||||
stxv v25, 368(SP)
|
||||
stxv v26, 384(SP)
|
||||
stxv v27, 400(SP)
|
||||
stxv v28, 416(SP)
|
||||
stxv v29, 432(SP)
|
||||
stxv v30, 448(SP)
|
||||
stxv v31, 464(SP)
|
||||
|
||||
|
||||
stw r0, FZERO
|
||||
|
||||
#ifdef linux
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#include "zgemm_macros_power9.S"
|
||||
|
||||
cmpwi cr0, M, 0 /* cmpwi: signed compare of the low 32 bits of M against 0 */
|
||||
ble L999 /* M <= 0: nothing to compute, go restore registers and return */
|
||||
cmpwi cr0, N, 0
|
||||
ble L999 /* N <= 0: same early exit */
|
||||
cmpwi cr0, K, 0
|
||||
ble L999 /* K <= 0: same early exit */
|
||||
|
||||
slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT (32-bit shift); presumably converts elements to bytes - ZBASE_SHIFT is defined outside this file */
|
||||
li PRE, 512 /* byte offset used as the index register of the dcbt/dcbtst prefetches in zgemm_logic_power9.S */
|
||||
li o8 , 8 /* constant 8 held in a register; no use visible in this listing, presumably consumed by the macros - TODO confirm */
|
||||
li o16 , 16 /* constant 16 held in a register; same remark as o8 */
|
||||
|
||||
addi BBUFFER, SP, 512+4096 /* BBUFFER = (SP + 512 + 4096) & -4096 ... */
|
||||
li T1, -4096
|
||||
and BBUFFER, BBUFFER, T1 /* ... i.e. a 4096-byte aligned address above SP+512 inside this frame; the logic file uses it as the BBO/BO base around the ZCOPYB_* macros */
|
||||
|
||||
|
||||
addi ALPHA, SP, 296+192 /* ALPHA = SP + 488; NOTE(review): not referenced elsewhere in the visible code - may be used by the macros */
|
||||
|
||||
xxlor alpha_r,vs1,vs1 /*copy from register f1 */
|
||||
xxlor alpha_i,vs2,vs2 /*copy from register f2 */
|
||||
|
||||
.align 4
|
||||
|
||||
#include "zgemm_logic_power9.S"
|
||||
|
||||
L999:
|
||||
addi r3, 0, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
lxv v20, 288(SP)
|
||||
lxv v21, 304(SP)
|
||||
lxv v22, 320(SP)
|
||||
lxv v23, 336(SP)
|
||||
lxv v24, 352(SP)
|
||||
lxv v25, 368(SP)
|
||||
lxv v26, 384(SP)
|
||||
lxv v27, 400(SP)
|
||||
lxv v28, 416(SP)
|
||||
lxv v29, 432(SP)
|
||||
lxv v30, 448(SP)
|
||||
lxv v31, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
|
@ -0,0 +1,857 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define MY_ALIGN .align 3
|
||||
|
||||
srawi. J, N, 1
|
||||
ble ZGEMM_L2_END
|
||||
|
||||
ZGEMM_L2_BEGIN:
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
srawi. T1, K, 2
|
||||
ble ZGEMM_L2_COPYB1
|
||||
|
||||
ZGEMM_L2_COPYB8:
|
||||
|
||||
addi T2, PRE, 128
|
||||
dcbt BO, PRE
|
||||
dcbtst BBO, PRE
|
||||
dcbtst BBO, T2
|
||||
ZCOPYB_8
|
||||
addic. T1, T1, -1
|
||||
|
||||
bgt ZGEMM_L2_COPYB8
|
||||
|
||||
ZGEMM_L2_COPYB1:
|
||||
|
||||
andi. T1, K, 3
|
||||
ble ZGEMM_L2_COPYB_END
|
||||
|
||||
ZGEMM_L2_COPYB_LOOP:
|
||||
|
||||
ZCOPYB_2
|
||||
addic. T1, T1, -1
|
||||
|
||||
bgt ZGEMM_L2_COPYB_LOOP
|
||||
|
||||
ZGEMM_L2_COPYB_END:
|
||||
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
slwi T1, LDC , 1
|
||||
add C, C, T1
|
||||
srawi. I, M, 3
|
||||
ble ZGEMM_L2x8_END
|
||||
|
||||
ZGEMM_L2x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 5 /* L = (K-1) >> 5, i.e. (K-1) / 32 (a quotient, not a remainder); also sets CR0 */
|
||||
ZERO2x8
|
||||
ble ZGEMM_L2x8_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO2x8 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
|
||||
ZGEMM_L2x8_LOOP_START:
|
||||
|
||||
LOAD2x8 0
|
||||
li T2, 1024
|
||||
li T3, 1024+512
|
||||
li T4, 2048
|
||||
li T5, 2048+512
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_LOOP:
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL2x8_L 128,64,0,0
|
||||
KERNEL2x8_L 128,64,1,0
|
||||
dcbt AO, T2
|
||||
KERNEL2x8_L 128,64,2,0
|
||||
KERNEL2x8_L 128,64,3,0
|
||||
dcbt AO, T3
|
||||
dcbt BO, T2
|
||||
KERNEL2x8_L 128,64,4,0
|
||||
KERNEL2x8_L 128,64,5,0
|
||||
dcbt AO, T4
|
||||
KERNEL2x8_L 128,64,6,0
|
||||
KERNEL2x8_L 128,64,7,0
|
||||
dcbt AO, T5
|
||||
dcbt BO, T3
|
||||
KERNEL2x8_L 128,64,8,0
|
||||
KERNEL2x8_L 128,64,9,0
|
||||
KERNEL2x8_L 128,64,10,0
|
||||
KERNEL2x8_L 128,64,11,0
|
||||
dcbt BO, T4
|
||||
KERNEL2x8_L 128,64,12,0
|
||||
KERNEL2x8_L 128,64,13,0
|
||||
KERNEL2x8_L 128,64,14,0
|
||||
KERNEL2x8_L 128,64,15,1
|
||||
bdnz ZGEMM_L2x8_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_LOOP_END:
|
||||
END2x8 AO, BO, 128, 64
|
||||
|
||||
b ZGEMM_L2x8_SUB1
|
||||
|
||||
ZGEMM_L2x8_SUB0:
|
||||
|
||||
andi. L, K, 63
|
||||
|
||||
b ZGEMM_L2x8_SUB2
|
||||
|
||||
ZGEMM_L2x8_SUB1:
|
||||
|
||||
andi. L, T1, 31
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
|
||||
ZGEMM_L2x8_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L2x8_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_SUB2_LOOP:
|
||||
LOAD2x8 0
|
||||
KERNEL2x8_L 128,64, 0,0
|
||||
KERNEL2x8_L 128,64, 1,0
|
||||
KERNEL2x8_L 128,64, 2,0
|
||||
KERNEL2x8_E 128,64, 3,1
|
||||
bdnz ZGEMM_L2x8_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L2x8_SUB2_2
|
||||
LOAD2x8 0
|
||||
KERNEL2x8_L 128,64, 0,0
|
||||
KERNEL2x8_E 128,64, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L2x8_SUB2_1
|
||||
LOAD2x8 0
|
||||
KERNEL2x8_E 128,64, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x8_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L2x8_SAVE
|
||||
KERNEL2x8
|
||||
|
||||
/* addic. L, L, -1
|
||||
bgt ZGEMM_L2x8_SUB2_1*/
|
||||
|
||||
ZGEMM_L2x8_SAVE:
|
||||
|
||||
SAVE2x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt ZGEMM_L2x8_BEGIN
|
||||
|
||||
ZGEMM_L2x8_END:
|
||||
|
||||
ZGEMM_L2x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble ZGEMM_L2x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble ZGEMM_L2x4_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 4 /* L = (K-1) >> 4, i.e. (K-1) / 16 (a quotient, not a remainder); also sets CR0 */
|
||||
ZERO2x4
|
||||
ble ZGEMM_L2x4_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO2x4 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L2x4_LOOP_START:
|
||||
LOAD2x4 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_LOOP:
|
||||
KERNEL2x4_L 64,64,0,0
|
||||
KERNEL2x4_L 64,64,1,0
|
||||
KERNEL2x4_L 64,64,2,0
|
||||
KERNEL2x4_L 64,64,3,0
|
||||
KERNEL2x4_L 64,64,4,0
|
||||
KERNEL2x4_L 64,64,5,0
|
||||
KERNEL2x4_L 64,64,6,0
|
||||
KERNEL2x4_L 64,64,7,1
|
||||
bdnz ZGEMM_L2x4_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_LOOP_END:
|
||||
END2x4 AO, BO, 64, 64
|
||||
|
||||
b ZGEMM_L2x4_SUB1
|
||||
|
||||
ZGEMM_L2x4_SUB0:
|
||||
|
||||
andi. L, K, 31
|
||||
|
||||
b ZGEMM_L2x4_SUB2
|
||||
|
||||
ZGEMM_L2x4_SUB1:
|
||||
|
||||
andi. L, T1, 15
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
|
||||
ZGEMM_L2x4_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L2x4_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_SUB2_LOOP:
|
||||
LOAD2x4 0
|
||||
KERNEL2x4_L 64,64, 0,0
|
||||
KERNEL2x4_L 64,64, 1,0
|
||||
KERNEL2x4_L 64,64, 2,0
|
||||
KERNEL2x4_E 64,64, 3,1
|
||||
bdnz ZGEMM_L2x4_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L2x4_SUB2_2
|
||||
LOAD2x4 0
|
||||
KERNEL2x4_L 64,64, 0,0
|
||||
KERNEL2x4_E 64,64, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L2x4_SUB2_1
|
||||
LOAD2x4 0
|
||||
KERNEL2x4_E 64,64, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x4_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L2x4_SAVE
|
||||
KERNEL2x4
|
||||
|
||||
ZGEMM_L2x4_SAVE:
|
||||
|
||||
SAVE2x4
|
||||
|
||||
ZGEMM_L2x4_END:
|
||||
|
||||
ZGEMM_L2x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble ZGEMM_L2x2_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 4 /* L = (K-1) >> 4, i.e. (K-1) / 16 (a quotient, not a remainder); also sets CR0 */
|
||||
ZERO2x2
|
||||
ble ZGEMM_L2x2_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO2x2 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L2x2_LOOP_START:
|
||||
LOAD2x2 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_LOOP:
|
||||
KERNEL2x2_L 32,64,0,0
|
||||
KERNEL2x2_L 32,64,1,0
|
||||
KERNEL2x2_L 32,64,2,0
|
||||
KERNEL2x2_L 32,64,3,0
|
||||
KERNEL2x2_L 32,64,4,0
|
||||
KERNEL2x2_L 32,64,5,0
|
||||
KERNEL2x2_L 32,64,6,0
|
||||
KERNEL2x2_L 32,64,7,1
|
||||
bdnz ZGEMM_L2x2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_LOOP_END:
|
||||
END2x2 AO, BO, 32, 64
|
||||
|
||||
b ZGEMM_L2x2_SUB1
|
||||
|
||||
ZGEMM_L2x2_SUB0:
|
||||
|
||||
andi. L, K, 31
|
||||
|
||||
b ZGEMM_L2x2_SUB2
|
||||
|
||||
ZGEMM_L2x2_SUB1:
|
||||
|
||||
andi. L, T1, 15
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
|
||||
ZGEMM_L2x2_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L2x2_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_SUB2_LOOP:
|
||||
LOAD2x2 0
|
||||
KERNEL2x2_L 32,64, 0,0
|
||||
KERNEL2x2_L 32,64, 1,0
|
||||
KERNEL2x2_L 32,64, 2,0
|
||||
KERNEL2x2_E 32,64, 3,1
|
||||
bdnz ZGEMM_L2x2_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L2x2_SUB2_2
|
||||
LOAD2x2 0
|
||||
KERNEL2x2_L 32,64, 0,0
|
||||
KERNEL2x2_E 32,64, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L2x2_SUB2_1
|
||||
LOAD2x2 0
|
||||
KERNEL2x2_E 32,64, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x2_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L2x2_SAVE
|
||||
KERNEL2x2
|
||||
ZGEMM_L2x2_SAVE:
|
||||
|
||||
SAVE2x2
|
||||
|
||||
ZGEMM_L2x2_END:
|
||||
|
||||
ZGEMM_L2x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble ZGEMM_L2x1_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 4 /* L = (K-1) >> 4, i.e. (K-1) / 16 (a quotient, not a remainder); also sets CR0 */
|
||||
ZERO2x1
|
||||
ble ZGEMM_L2x1_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO2x1 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L2x1_LOOP_START:
|
||||
|
||||
LOAD2x1 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_LOOP:
|
||||
KERNEL2x1_L 16,64,0,0
|
||||
KERNEL2x1_L 16,64,1,0
|
||||
KERNEL2x1_L 16,64,2,0
|
||||
KERNEL2x1_L 16,64,3,0
|
||||
KERNEL2x1_L 16,64,4,0
|
||||
KERNEL2x1_L 16,64,5,0
|
||||
KERNEL2x1_L 16,64,6,0
|
||||
KERNEL2x1_L 16,64,7,1
|
||||
bdnz ZGEMM_L2x1_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_LOOP_END:
|
||||
END2x1 AO, BO, 16, 64
|
||||
|
||||
b ZGEMM_L2x1_SUB1
|
||||
|
||||
ZGEMM_L2x1_SUB0:
|
||||
|
||||
andi. L, K, 31
|
||||
|
||||
b ZGEMM_L2x1_SUB2
|
||||
|
||||
ZGEMM_L2x1_SUB1:
|
||||
|
||||
andi. L, T1, 15
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
|
||||
ZGEMM_L2x1_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L2x1_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_SUB2_LOOP:
|
||||
LOAD2x1 0
|
||||
KERNEL2x1_L 16,64, 0,0
|
||||
KERNEL2x1_L 16,64, 1,0
|
||||
KERNEL2x1_L 16,64, 2,0
|
||||
KERNEL2x1_E 16,64, 3,1
|
||||
bdnz ZGEMM_L2x1_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L2x1_SUB2_2
|
||||
LOAD2x1 0
|
||||
KERNEL2x1_L 16,64, 0,0
|
||||
KERNEL2x1_E 16,64, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L2x1_SUB2_1
|
||||
LOAD2x1 0
|
||||
KERNEL2x1_E 16,64, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L2x1_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L2x1_SAVE
|
||||
KERNEL2x1
|
||||
|
||||
ZGEMM_L2x1_SAVE:
|
||||
|
||||
SAVE2x1
|
||||
|
||||
ZGEMM_L2x1_END:
|
||||
|
||||
slwi T1, K, 5
|
||||
add B, B, T1
|
||||
|
||||
addic. J, J, -1
|
||||
bgt ZGEMM_L2_BEGIN
|
||||
|
||||
andi. T2, N, 1 /* T2 = N & 1; sets CR0 */
|
||||
ble L999 /* N even: no leftover single column, finished */
|
||||
|
||||
ZGEMM_L2_END: /* also reached directly when N/2 == 0 (branch at the top of this file) */
|
||||
|
||||
b ZGEMM_L1_BEGIN /* handle the remaining single column; ZGEMM_L1_BEGIN re-tests N & 1 itself */
|
||||
|
||||
L999_H1: /* NOTE(review): no branch to this label is visible in this listing; the preceding unconditional b means it is not reached by fall-through */
|
||||
|
||||
b L999
|
||||
|
||||
ZGEMM_L1_BEGIN:
|
||||
andi. T1, N, 1
|
||||
ble ZGEMM_L1_END
|
||||
|
||||
mr BO, B
|
||||
mr BBO, BBUFFER
|
||||
srawi. T1, K, 3 /*this time K/8 */
|
||||
ble ZGEMM_L1_COPYB1
|
||||
|
||||
ZGEMM_L1_COPYB8:
|
||||
|
||||
addi T2, PRE, 128
|
||||
dcbt BO, PRE
|
||||
dcbtst BBO, PRE
|
||||
dcbtst BBO, T2
|
||||
ZCOPYB_8
|
||||
addic. T1, T1, -1
|
||||
|
||||
bgt ZGEMM_L1_COPYB8
|
||||
|
||||
ZGEMM_L1_COPYB1:
|
||||
|
||||
andi. T1, K, 7
|
||||
ble ZGEMM_L1_COPYB_END
|
||||
|
||||
ZGEMM_L1_COPYB_LOOP:
|
||||
|
||||
ZCOPYB_1
|
||||
addic. T1, T1, -1
|
||||
|
||||
bgt ZGEMM_L1_COPYB_LOOP
|
||||
|
||||
ZGEMM_L1_COPYB_END:
|
||||
|
||||
mr CO, C
|
||||
mr AO, A
|
||||
srawi. I, M, 3
|
||||
ble ZGEMM_L1x8_END
|
||||
|
||||
ZGEMM_L1x8_BEGIN:
|
||||
|
||||
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 5 /* L = (K-1) >> 5, i.e. (K-1) / 32 (a quotient, not a remainder); also sets CR0 */
|
||||
ZERO1x8
|
||||
ble ZGEMM_L1x8_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO1x8 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
|
||||
ZGEMM_L1x8_LOOP_START:
|
||||
|
||||
LOAD1x8 0
|
||||
li T2, 1024
|
||||
li T3, 1024+512
|
||||
li T4, 2048
|
||||
li T5, 2048+512
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_LOOP:
|
||||
dcbt AO, PRE
|
||||
dcbt BO, PRE
|
||||
KERNEL1x8_L 128,32,0,0
|
||||
KERNEL1x8_L 128,32,1,0
|
||||
dcbt AO, T2
|
||||
KERNEL1x8_L 128,32,2,0
|
||||
KERNEL1x8_L 128,32,3,0
|
||||
dcbt AO, T3
|
||||
dcbt BO, T2
|
||||
KERNEL1x8_L 128,32,4,0
|
||||
KERNEL1x8_L 128,32,5,0
|
||||
dcbt AO, T4
|
||||
KERNEL1x8_L 128,32,6,0
|
||||
KERNEL1x8_L 128,32,7,0
|
||||
dcbt AO, T5
|
||||
dcbt BO, T3
|
||||
KERNEL1x8_L 128,32,8,0
|
||||
KERNEL1x8_L 128,32,9,0
|
||||
KERNEL1x8_L 128,32,10,0
|
||||
KERNEL1x8_L 128,32,11,0
|
||||
dcbt BO, T4
|
||||
KERNEL1x8_L 128,32,12,0
|
||||
KERNEL1x8_L 128,32,13,0
|
||||
KERNEL1x8_L 128,32,14,0
|
||||
KERNEL1x8_L 128,32,15,1
|
||||
bdnz ZGEMM_L1x8_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_LOOP_END:
|
||||
END1x8 AO, BO, 128, 32
|
||||
|
||||
b ZGEMM_L1x8_SUB1
|
||||
|
||||
ZGEMM_L1x8_SUB0:
|
||||
|
||||
andi. L, K, 63
|
||||
|
||||
b ZGEMM_L1x8_SUB2
|
||||
|
||||
ZGEMM_L1x8_SUB1:
|
||||
|
||||
andi. L, T1, 31
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
|
||||
ZGEMM_L1x8_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L1x8_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_SUB2_LOOP:
|
||||
LOAD1x8 0
|
||||
KERNEL1x8_L 128,32, 0,0
|
||||
KERNEL1x8_L 128,32, 1,0
|
||||
KERNEL1x8_L 128,32, 2,0
|
||||
KERNEL1x8_E 128,32, 3,1
|
||||
bdnz ZGEMM_L1x8_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L1x8_SUB2_2
|
||||
LOAD1x8 0
|
||||
KERNEL1x8_L 128,32, 0,0
|
||||
KERNEL1x8_E 128,32, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L1x8_SUB2_1
|
||||
LOAD1x8 0
|
||||
KERNEL1x8_E 128,32, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x8_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L1x8_SAVE
|
||||
KERNEL1x8
|
||||
|
||||
/* addic. L, L, -1
|
||||
bgt ZGEMM_L1x8_SUB2_1*/
|
||||
|
||||
ZGEMM_L1x8_SAVE:
|
||||
|
||||
SAVE1x8
|
||||
|
||||
addic. I, I, -1
|
||||
bgt ZGEMM_L1x8_BEGIN
|
||||
|
||||
ZGEMM_L1x8_END:
|
||||
|
||||
ZGEMM_L1x4_BEGIN:
|
||||
|
||||
andi. T2, M, 7
|
||||
ble ZGEMM_L1x1_END
|
||||
|
||||
andi. T1, M, 4
|
||||
ble ZGEMM_L1x4_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 5 /* L = (K-1) >> 5, i.e. (K-1) / 32 - the shift is 5, not 4, matching the "andi. L, T1, 31" remainder at ZGEMM_L1x4_SUB1; also sets CR0 */
|
||||
ZERO1x4
|
||||
ble ZGEMM_L1x4_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO1x4 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L1x4_LOOP_START:
|
||||
LOAD1x4 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_LOOP:
|
||||
KERNEL1x4_L 64,32,0,0
|
||||
KERNEL1x4_L 64,32,1,0
|
||||
KERNEL1x4_L 64,32,2,0
|
||||
KERNEL1x4_L 64,32,3,0
|
||||
KERNEL1x4_L 64,32,4,0
|
||||
KERNEL1x4_L 64,32,5,0
|
||||
KERNEL1x4_L 64,32,6,0
|
||||
KERNEL1x4_L 64,32,7,0
|
||||
KERNEL1x4_L 64,32,8,0
|
||||
KERNEL1x4_L 64,32,9,0
|
||||
KERNEL1x4_L 64,32,10,0
|
||||
KERNEL1x4_L 64,32,11,0
|
||||
KERNEL1x4_L 64,32,12,0
|
||||
KERNEL1x4_L 64,32,13,0
|
||||
KERNEL1x4_L 64,32,14,0
|
||||
KERNEL1x4_L 64,32,15,1
|
||||
bdnz ZGEMM_L1x4_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_LOOP_END:
|
||||
END1x4 AO, BO, 64, 32
|
||||
|
||||
b ZGEMM_L1x4_SUB1
|
||||
|
||||
ZGEMM_L1x4_SUB0:
|
||||
|
||||
andi. L, K, 63
|
||||
|
||||
b ZGEMM_L1x4_SUB2
|
||||
|
||||
ZGEMM_L1x4_SUB1:
|
||||
|
||||
andi. L, T1, 31
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
|
||||
ZGEMM_L1x4_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L1x4_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_SUB2_LOOP:
|
||||
LOAD1x4 0
|
||||
KERNEL1x4_L 64,32, 0,0
|
||||
KERNEL1x4_L 64,32, 1,0
|
||||
KERNEL1x4_L 64,32, 2,0
|
||||
KERNEL1x4_E 64,32, 3,1
|
||||
bdnz ZGEMM_L1x4_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L1x4_SUB2_2
|
||||
LOAD1x4 0
|
||||
KERNEL1x4_L 64,32, 0,0
|
||||
KERNEL1x4_E 64,32, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L1x4_SUB2_1
|
||||
LOAD1x4 0
|
||||
KERNEL1x4_E 64,32, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x4_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L1x4_SAVE
|
||||
KERNEL1x4
|
||||
|
||||
ZGEMM_L1x4_SAVE:
|
||||
|
||||
SAVE1x4
|
||||
|
||||
ZGEMM_L1x4_END:
|
||||
|
||||
ZGEMM_L1x2_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 2
|
||||
ble ZGEMM_L1x2_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 5 /* L = (K-1) >> 5, i.e. (K-1) / 32 - the shift is 5, not 4, matching the "andi. L, T1, 31" remainder at ZGEMM_L1x2_SUB1; also sets CR0 */
|
||||
ZERO1x2
|
||||
ble ZGEMM_L1x2_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO1x2 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L1x2_LOOP_START:
|
||||
LOAD1x2 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_LOOP:
|
||||
KERNEL1x2_L 32,32,0,0
|
||||
KERNEL1x2_L 32,32,1,0
|
||||
KERNEL1x2_L 32,32,2,0
|
||||
KERNEL1x2_L 32,32,3,0
|
||||
KERNEL1x2_L 32,32,4,0
|
||||
KERNEL1x2_L 32,32,5,0
|
||||
KERNEL1x2_L 32,32,6,0
|
||||
KERNEL1x2_L 32,32,7,0
|
||||
KERNEL1x2_L 32,32,8,0
|
||||
KERNEL1x2_L 32,32,9,0
|
||||
KERNEL1x2_L 32,32,10,0
|
||||
KERNEL1x2_L 32,32,11,0
|
||||
KERNEL1x2_L 32,32,12,0
|
||||
KERNEL1x2_L 32,32,13,0
|
||||
KERNEL1x2_L 32,32,14,0
|
||||
KERNEL1x2_L 32,32,15,1
|
||||
bdnz ZGEMM_L1x2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_LOOP_END:
|
||||
END1x2 AO, BO, 32, 32
|
||||
|
||||
b ZGEMM_L1x2_SUB1
|
||||
|
||||
ZGEMM_L1x2_SUB0:
|
||||
|
||||
andi. L, K, 63
|
||||
|
||||
b ZGEMM_L1x2_SUB2
|
||||
|
||||
ZGEMM_L1x2_SUB1:
|
||||
|
||||
andi. L, T1, 31
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
|
||||
ZGEMM_L1x2_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L1x2_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_SUB2_LOOP:
|
||||
LOAD1x2 0
|
||||
KERNEL1x2_L 32,32, 0,0
|
||||
KERNEL1x2_L 32,32, 1,0
|
||||
KERNEL1x2_L 32,32, 2,0
|
||||
KERNEL1x2_E 32,32, 3,1
|
||||
bdnz ZGEMM_L1x2_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L1x2_SUB2_2
|
||||
LOAD1x2 0
|
||||
KERNEL1x2_L 32,32, 0,0
|
||||
KERNEL1x2_E 32,32, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L1x2_SUB2_1
|
||||
LOAD1x2 0
|
||||
KERNEL1x2_E 32,32, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x2_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L1x2_SAVE
|
||||
KERNEL1x2
|
||||
ZGEMM_L1x2_SAVE:
|
||||
|
||||
SAVE1x2
|
||||
|
||||
ZGEMM_L1x2_END:
|
||||
|
||||
ZGEMM_L1x1_BEGIN:
|
||||
|
||||
|
||||
andi. T1, M, 1
|
||||
ble ZGEMM_L1x1_END
|
||||
mr BO, BBUFFER /* restart B pointer at the start of BBUFFER for this tile */
|
||||
mr T1, K
|
||||
addi T1,T1, -1 /* T1 = K - 1 */
|
||||
srawi. L, T1, 5 /* L = (K-1) >> 5, i.e. (K-1) / 32 - the shift is 5, not 4, matching the "andi. L, T1, 31" remainder at ZGEMM_L1x1_SUB1; also sets CR0 */
|
||||
ZERO1x1
|
||||
ble ZGEMM_L1x1_SUB0 /* L <= 0: bypass the unrolled loop. Tests CR0 set by srawi. above - NOTE(review): assumes the ZERO1x1 macro (defined outside this listing) leaves CR0 unchanged */
|
||||
|
||||
ZGEMM_L1x1_LOOP_START:
|
||||
|
||||
LOAD1x1 0
|
||||
mtctr L
|
||||
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_LOOP:
|
||||
KERNEL1x1_L 16,32,0,0
|
||||
KERNEL1x1_L 16,32,1,0
|
||||
KERNEL1x1_L 16,32,2,0
|
||||
KERNEL1x1_L 16,32,3,0
|
||||
KERNEL1x1_L 16,32,4,0
|
||||
KERNEL1x1_L 16,32,5,0
|
||||
KERNEL1x1_L 16,32,6,0
|
||||
KERNEL1x1_L 16,32,7,0
|
||||
KERNEL1x1_L 16,32,8,0
|
||||
KERNEL1x1_L 16,32,9,0
|
||||
KERNEL1x1_L 16,32,10,0
|
||||
KERNEL1x1_L 16,32,11,0
|
||||
KERNEL1x1_L 16,32,12,0
|
||||
KERNEL1x1_L 16,32,13,0
|
||||
KERNEL1x1_L 16,32,14,0
|
||||
KERNEL1x1_L 16,32,15,1
|
||||
bdnz ZGEMM_L1x1_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_LOOP_END:
|
||||
END1x1 AO, BO, 16, 32
|
||||
|
||||
b ZGEMM_L1x1_SUB1
|
||||
|
||||
ZGEMM_L1x1_SUB0:
|
||||
|
||||
andi. L, K, 63
|
||||
|
||||
b ZGEMM_L1x1_SUB2
|
||||
|
||||
ZGEMM_L1x1_SUB1:
|
||||
|
||||
andi. L, T1, 31
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
|
||||
ZGEMM_L1x1_SUB2:
|
||||
srawi. T1,L, 3
|
||||
ble ZGEMM_L1x1_SUB2_4
|
||||
mtctr T1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_SUB2_LOOP:
|
||||
LOAD1x1 0
|
||||
KERNEL1x1_L 16,32, 0,0
|
||||
KERNEL1x1_L 16,32, 1,0
|
||||
KERNEL1x1_L 16,32, 2,0
|
||||
KERNEL1x1_E 16,32, 3,1
|
||||
bdnz ZGEMM_L1x1_SUB2_LOOP
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_SUB2_4:
|
||||
andi. T1,L, 4
|
||||
ble ZGEMM_L1x1_SUB2_2
|
||||
LOAD1x1 0
|
||||
KERNEL1x1_L 16,32, 0,0
|
||||
KERNEL1x1_E 16,32, 1,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_SUB2_2:
|
||||
andi. T1,L, 2
|
||||
ble ZGEMM_L1x1_SUB2_1
|
||||
LOAD1x1 0
|
||||
KERNEL1x1_E 16,32, 0,1
|
||||
MY_ALIGN
|
||||
ZGEMM_L1x1_SUB2_1:
|
||||
andi. T1,L, 1
|
||||
ble ZGEMM_L1x1_SAVE
|
||||
KERNEL1x1
|
||||
|
||||
ZGEMM_L1x1_SAVE:
|
||||
|
||||
SAVE1x1
|
||||
|
||||
ZGEMM_L1x1_END:
|
||||
|
||||
ZGEMM_L1_END:
|
File diff suppressed because it is too large
Load Diff
4
param.h
4
param.h
|
@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SGEMM_DEFAULT_P 640
|
||||
#define DGEMM_DEFAULT_P 128
|
||||
#define CGEMM_DEFAULT_P 640
|
||||
#define ZGEMM_DEFAULT_P 320
|
||||
#define ZGEMM_DEFAULT_P 512
|
||||
|
||||
#define SGEMM_DEFAULT_Q 1408
|
||||
#define DGEMM_DEFAULT_Q 384
|
||||
#define CGEMM_DEFAULT_Q 640
|
||||
#define ZGEMM_DEFAULT_Q 640
|
||||
#define ZGEMM_DEFAULT_Q 1152
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
|
Loading…
Reference in New Issue