/*
 * CGEMM kernel logic for POWER10 using the Matrix-Multiply Assist (MMA)
 * feature introduced in POWER ISA v3.1. Uses the new POWER10 compute
 * instructions for the matrix-multiplication inner kernels. Compared with
 * the POWER9 version, cycle counts are reduced by roughly 30-50%
 * depending on the M/N/K sizes.
 *
 * MMA GCC patch for reference:
 * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8ee2640bfdc62f835ec9740278f948034bc7d9f1
 */
/***************************************************************************
|
|
Copyright (c) 2013-2020, The OpenBLAS Project
|
|
All rights reserved.
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
3. Neither the name of the OpenBLAS project nor the names of
|
|
its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*****************************************************************************/
|
|
|
|
#define MY_ALIGN .align 3
|
|
b CGEMM_L4
|
|
/* MINI SUBROUTINES */
|
|
/* 4x8 MAIN 128x+2 LOOP */
|
|
|
|
|
|
CGEMM_L4x8_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD4x8_2
|
|
MY_ALIGN
|
|
CGEMM_L4x8_LOOP:
|
|
/*----------------------------------------*/
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL4x8_L2 128,64,0,0
|
|
CGEMM_L4x8_K128:
|
|
/*----------------------------------------*/
|
|
KERNEL4x8_L2 128,64,1,0
|
|
dcbt AO, T2
|
|
KERNEL4x8_L2 128,64,2,0
|
|
KERNEL4x8_L2 128,64,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL4x8_L2 128,64,4,0
|
|
KERNEL4x8_L2 128,64,5,0
|
|
dcbt AO, T4
|
|
KERNEL4x8_L2 128,64,6,0
|
|
KERNEL4x8_L2 128,64,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL4x8_L2 128,64,8,0
|
|
KERNEL4x8_L2 128,64,9,0
|
|
KERNEL4x8_L2 128,64,10,0
|
|
KERNEL4x8_L2 128,64,11,0
|
|
dcbt BO, T4
|
|
KERNEL4x8_L2 128,64,12,0
|
|
KERNEL4x8_L2 128,64,13,0
|
|
KERNEL4x8_L2 128,64,14,0
|
|
KERNEL4x8_L2 128,64,15,0
|
|
KERNEL4x8_L2 128,64,16,0
|
|
KERNEL4x8_L2 128,64,17,0
|
|
KERNEL4x8_L2 128,64,18,0
|
|
KERNEL4x8_L2 128,64,19,0
|
|
KERNEL4x8_L2 128,64,20,0
|
|
KERNEL4x8_L2 128,64,21,0
|
|
KERNEL4x8_L2 128,64,22,0
|
|
KERNEL4x8_L2 128,64,23,0
|
|
KERNEL4x8_L2 128,64,24,0
|
|
KERNEL4x8_L2 128,64,25,0
|
|
KERNEL4x8_L2 128,64,26,0
|
|
KERNEL4x8_L2 128,64,27,0
|
|
KERNEL4x8_L2 128,64,28,0
|
|
KERNEL4x8_L2 128,64,29,0
|
|
KERNEL4x8_L2 128,64,30,0
|
|
KERNEL4x8_L2 128,64,31,0
|
|
KERNEL4x8_L2 128,64,32,0
|
|
KERNEL4x8_L2 128,64,33,0
|
|
KERNEL4x8_L2 128,64,34,0
|
|
KERNEL4x8_L2 128,64,35,0
|
|
KERNEL4x8_L2 128,64,36,0
|
|
KERNEL4x8_L2 128,64,37,0
|
|
KERNEL4x8_L2 128,64,38,0
|
|
KERNEL4x8_L2 128,64,39,0
|
|
KERNEL4x8_L2 128,64,40,0
|
|
KERNEL4x8_L2 128,64,41,0
|
|
KERNEL4x8_L2 128,64,42,0
|
|
KERNEL4x8_L2 128,64,43,0
|
|
KERNEL4x8_L2 128,64,44,0
|
|
KERNEL4x8_L2 128,64,45,0
|
|
KERNEL4x8_L2 128,64,46,0
|
|
KERNEL4x8_L2 128,64,47,0
|
|
KERNEL4x8_L2 128,64,48,0
|
|
KERNEL4x8_L2 128,64,49,0
|
|
KERNEL4x8_L2 128,64,50,0
|
|
KERNEL4x8_L2 128,64,51,0
|
|
KERNEL4x8_L2 128,64,52,0
|
|
KERNEL4x8_L2 128,64,53,0
|
|
KERNEL4x8_L2 128,64,54,0
|
|
KERNEL4x8_L2 128,64,55,0
|
|
KERNEL4x8_L2 128,64,56,0
|
|
KERNEL4x8_L2 128,64,57,0
|
|
KERNEL4x8_L2 128,64,58,0
|
|
KERNEL4x8_L2 128,64,59,0
|
|
KERNEL4x8_L2 128,64,60,0
|
|
KERNEL4x8_L2 128,64,61,0
|
|
KERNEL4x8_L2 128,64,62,0
|
|
KERNEL4x8_L2 128,64,63,1
|
|
bdnz CGEMM_L4x8_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L4x8_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END4x8_2
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x8_L64_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL4x8_L2 128,64,0,0
|
|
KERNEL4x8_L2 128,64,1,0
|
|
dcbt AO, T2
|
|
KERNEL4x8_L2 128,64,2,0
|
|
KERNEL4x8_L2 128,64,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL4x8_L2 128,64,4,0
|
|
KERNEL4x8_L2 128,64,5,0
|
|
dcbt AO, T4
|
|
KERNEL4x8_L2 128,64,6,0
|
|
KERNEL4x8_L2 128,64,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL4x8_L2 128,64,8,0
|
|
KERNEL4x8_L2 128,64,9,0
|
|
KERNEL4x8_L2 128,64,10,0
|
|
KERNEL4x8_L2 128,64,11,0
|
|
dcbt BO, T4
|
|
KERNEL4x8_L2 128,64,12,0
|
|
KERNEL4x8_L2 128,64,13,0
|
|
KERNEL4x8_L2 128,64,14,0
|
|
KERNEL4x8_L2 128,64,15,0
|
|
KERNEL4x8_L2 128,64,16,0
|
|
KERNEL4x8_L2 128,64,17,0
|
|
KERNEL4x8_L2 128,64,18,0
|
|
KERNEL4x8_L2 128,64,19,0
|
|
KERNEL4x8_L2 128,64,20,0
|
|
KERNEL4x8_L2 128,64,21,0
|
|
KERNEL4x8_L2 128,64,22,0
|
|
KERNEL4x8_L2 128,64,23,0
|
|
KERNEL4x8_L2 128,64,24,0
|
|
KERNEL4x8_L2 128,64,25,0
|
|
KERNEL4x8_L2 128,64,26,0
|
|
KERNEL4x8_L2 128,64,27,0
|
|
KERNEL4x8_L2 128,64,28,0
|
|
KERNEL4x8_L2 128,64,29,0
|
|
KERNEL4x8_L2 128,64,30,0
|
|
KERNEL4x8_E2 128,64,31,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x8_L32_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL4x8_L2 128,64,0,0
|
|
KERNEL4x8_L2 128,64,1,0
|
|
dcbt AO, T2
|
|
KERNEL4x8_L2 128,64,2,0
|
|
KERNEL4x8_L2 128,64,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL4x8_L2 128,64,4,0
|
|
KERNEL4x8_L2 128,64,5,0
|
|
dcbt AO, T4
|
|
KERNEL4x8_L2 128,64,6,0
|
|
KERNEL4x8_L2 128,64,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL4x8_L2 128,64,8,0
|
|
KERNEL4x8_L2 128,64,9,0
|
|
KERNEL4x8_L2 128,64,10,0
|
|
KERNEL4x8_L2 128,64,11,0
|
|
dcbt BO, T4
|
|
KERNEL4x8_L2 128,64,12,0
|
|
KERNEL4x8_L2 128,64,13,0
|
|
KERNEL4x8_L2 128,64,14,0
|
|
KERNEL4x8_E2 128,64,15,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x8_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL4x8_L2 128,64,0,0
|
|
KERNEL4x8_L2 128,64,1,0
|
|
dcbt AO, T2
|
|
KERNEL4x8_L2 128,64,2,0
|
|
KERNEL4x8_L2 128,64,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL4x8_L2 128,64,4,0
|
|
KERNEL4x8_L2 128,64,5,0
|
|
dcbt AO, T4
|
|
KERNEL4x8_L2 128,64,6,0
|
|
KERNEL4x8_E2 128,64,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x4_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD4x4_2
|
|
MY_ALIGN
|
|
CGEMM_L4x4_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL4x4_L2 64,64,0,0
|
|
CGEMM_L4x4_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL4x4_L2 64,64,1,0
|
|
KERNEL4x4_L2 64,64,2,0
|
|
KERNEL4x4_L2 64,64,3,0
|
|
KERNEL4x4_L2 64,64,4,0
|
|
KERNEL4x4_L2 64,64,5,0
|
|
KERNEL4x4_L2 64,64,6,0
|
|
KERNEL4x4_L2 64,64,7,0
|
|
KERNEL4x4_L2 64,64,8,0
|
|
KERNEL4x4_L2 64,64,9,0
|
|
KERNEL4x4_L2 64,64,10,0
|
|
KERNEL4x4_L2 64,64,11,0
|
|
KERNEL4x4_L2 64,64,12,0
|
|
KERNEL4x4_L2 64,64,13,0
|
|
KERNEL4x4_L2 64,64,14,0
|
|
KERNEL4x4_L2 64,64,15,1
|
|
bdnz CGEMM_L4x4_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L4x4_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END4x4_2
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x4_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x4_2
|
|
KERNEL4x4_L2 64,64,0,0
|
|
KERNEL4x4_L2 64,64,1,0
|
|
KERNEL4x4_L2 64,64,2,0
|
|
KERNEL4x4_L2 64,64,3,0
|
|
KERNEL4x4_L2 64,64,4,0
|
|
KERNEL4x4_L2 64,64,5,0
|
|
KERNEL4x4_L2 64,64,6,0
|
|
KERNEL4x4_E2 64,64,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x4_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x4_2
|
|
KERNEL4x4_L2 64,64,0,0
|
|
KERNEL4x4_L2 64,64,1,0
|
|
KERNEL4x4_L2 64,64,2,0
|
|
KERNEL4x4_E2 64,64,3,1
|
|
blr
|
|
|
|
|
|
CGEMM_4x2_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD4x2_2
|
|
MY_ALIGN
|
|
CGEMM_L4x2_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL4x2_L2 32,64,0,0
|
|
CGEMM_L4x2_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL4x2_L2 32,64,1,0
|
|
KERNEL4x2_L2 32,64,2,0
|
|
KERNEL4x2_L2 32,64,3,0
|
|
KERNEL4x2_L2 32,64,4,0
|
|
KERNEL4x2_L2 32,64,5,0
|
|
KERNEL4x2_L2 32,64,6,0
|
|
KERNEL4x2_L2 32,64,7,0
|
|
KERNEL4x2_L2 32,64,8,0
|
|
KERNEL4x2_L2 32,64,9,0
|
|
KERNEL4x2_L2 32,64,10,0
|
|
KERNEL4x2_L2 32,64,11,0
|
|
KERNEL4x2_L2 32,64,12,0
|
|
KERNEL4x2_L2 32,64,13,0
|
|
KERNEL4x2_L2 32,64,14,0
|
|
KERNEL4x2_L2 32,64,15,1
|
|
bdnz CGEMM_L4x2_LOOP
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END4x2_2
|
|
blr
|
|
MY_ALIGN
|
|
CGEMM_4x2_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x2_2
|
|
KERNEL4x2_L2 32,64,0,0
|
|
KERNEL4x2_L2 32,64,1,0
|
|
KERNEL4x2_L2 32,64,2,0
|
|
KERNEL4x2_L2 32,64,3,0
|
|
KERNEL4x2_L2 32,64,4,0
|
|
KERNEL4x2_L2 32,64,5,0
|
|
KERNEL4x2_L2 32,64,6,0
|
|
KERNEL4x2_E2 32,64,7,1
|
|
blr
|
|
MY_ALIGN
|
|
CGEMM_4x2_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x2_2
|
|
KERNEL4x2_L2 32,64,0,0
|
|
KERNEL4x2_L2 32,64,1,0
|
|
KERNEL4x2_L2 32,64,2,0
|
|
KERNEL4x2_E2 32,64,3,1
|
|
blr
|
|
|
|
|
|
CGEMM_4x1_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD4x1_2
|
|
MY_ALIGN
|
|
CGEMM_L4x1_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL4x1_L2 16,64,0,0
|
|
CGEMM_L4x1_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL4x1_L2 16,64,1,0
|
|
KERNEL4x1_L2 16,64,2,0
|
|
KERNEL4x1_L2 16,64,3,0
|
|
KERNEL4x1_L2 16,64,4,0
|
|
KERNEL4x1_L2 16,64,5,0
|
|
KERNEL4x1_L2 16,64,6,0
|
|
KERNEL4x1_L2 16,64,7,0
|
|
KERNEL4x1_L2 16,64,8,0
|
|
KERNEL4x1_L2 16,64,9,0
|
|
KERNEL4x1_L2 16,64,10,0
|
|
KERNEL4x1_L2 16,64,11,0
|
|
KERNEL4x1_L2 16,64,12,0
|
|
KERNEL4x1_L2 16,64,13,0
|
|
KERNEL4x1_L2 16,64,14,0
|
|
KERNEL4x1_L2 16,64,15,1
|
|
bdnz CGEMM_L4x1_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L4x1_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END4x1_2
|
|
blr
|
|
|
|
MY_ALIGN
|
|
CGEMM_4x1_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x1_2
|
|
KERNEL4x1_L2 16,64,0,0
|
|
KERNEL4x1_L2 16,64,1,0
|
|
KERNEL4x1_L2 16,64,2,0
|
|
KERNEL4x1_L2 16,64,3,0
|
|
KERNEL4x1_L2 16,64,4,0
|
|
KERNEL4x1_L2 16,64,5,0
|
|
KERNEL4x1_L2 16,64,6,0
|
|
KERNEL4x1_E2 16,64,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_4x1_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD4x1_2
|
|
KERNEL4x1_L2 16,64,0,0
|
|
KERNEL4x1_L2 16,64,1,0
|
|
KERNEL4x1_L2 16,64,2,0
|
|
KERNEL4x1_E2 16,64,3,1
|
|
blr
|
|
|
|
|
|
|
|
/* MAIN LOOP BEGINS */
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
neg TEMP_REG, OFFSET
|
|
#endif
|
|
/* Pre set value in vs57 as 0xffff0000ffff0000 for masking */
|
|
vspltisb v24, -1
|
|
vspltisb v25, 0
|
|
xxsldwi vs57, vs56, vs57, 1
|
|
xxpermdi vs57, vs57, vs57, 3
|
|
srawi. J, N, 2
|
|
ble CGEMM_L4_END
|
|
|
|
|
|
CGEMM_L4_BEGIN:
|
|
/*----------------------------------------*/
|
|
mr CO, C
|
|
slwi T1, LDC , 2
|
|
add T2,C,LDC
|
|
mr AO, A
|
|
add C, C, T1
|
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
|
mr TEMP_REG, OFFSET /*off = offset;*/
|
|
#endif
|
|
srawi. I, M, 3
|
|
ble CGEMM_L4x8_END
|
|
dcbt CO,r0 /*just prefetch*/
|
|
dcbt T2,r0
|
|
|
|
|
|
CGEMM_L4x8_BEGIN:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
|
|
#else
|
|
mr BO, B
|
|
dcbt B, r0
|
|
#endif
|
|
dcbt AO, r0
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_TEMP_BK T6,K,TEMP_REG,8,4
|
|
mr T1, T6
|
|
/* TEMPS FOR PREFETCH */
|
|
li T2, 1024
|
|
li T3, 1024+512
|
|
addi T1,T1, -2
|
|
/* TEMPS FOR PREFETCH */
|
|
li T4, 2048
|
|
li T5, 2048+512
|
|
srawi. T8, T1, 7 /**(T1-2) % 128x */
|
|
#else
|
|
mr T1, K
|
|
/* TEMPS FOR PREFETCH */
|
|
li T2, 1024
|
|
li T3, 1024+512
|
|
addi T1,T1, -2
|
|
/* TEMPS FOR PREFETCH */
|
|
li T4, 2048
|
|
li T5, 2048+512
|
|
srawi. T8, T1, 7 /**(K-2) % 128x */
|
|
#endif
|
|
ZERO4x8
|
|
ble CGEMM_L4x8_SUB0
|
|
bl CGEMM_L4x8_LMAIN_SUB
|
|
andi. L, T1, 127
|
|
ble CGEMM_L4x8_SAVE
|
|
b CGEMM_L4x8_SUB2
|
|
|
|
|
|
CGEMM_L4x8_SUB0:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
andi. L, T6, 255
|
|
cmpwi T6,129
|
|
#else
|
|
andi. L, K, 255
|
|
cmpwi K,129
|
|
#endif
|
|
li T8,1
|
|
bne CMP4x8_128K
|
|
addi BO,BO,-32
|
|
addi AO,AO,-64
|
|
LOAD4x8O 64,32
|
|
END4x8_WITHOUT_ADD
|
|
LOAD4x8_2O 128, 64
|
|
mtctr T8
|
|
bl CGEMM_L4x8_K128
|
|
b CGEMM_L4x8_SAVE
|
|
CMP4x8_128K:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
cmpwi T6,128
|
|
#else
|
|
cmpwi K,128
|
|
#endif
|
|
bne CGEMM_L4x8_SUB2
|
|
MY_ALIGN
|
|
mtctr T8
|
|
addi BO,BO,-64
|
|
addi AO,AO,-128
|
|
LOAD4x8_2O 128,64
|
|
bl CGEMM_L4x8_K128
|
|
b CGEMM_L4x8_SAVE
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 64
|
|
ble CGEMM_L4x8_SUB2_32
|
|
bl CGEMM_4x8_L64_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_32:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 32
|
|
ble CGEMM_L4x8_SUB2_16
|
|
bl CGEMM_4x8_L32_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_16:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 16
|
|
ble CGEMM_L4x8_SUB2_8
|
|
bl CGEMM_4x8_L16_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_8:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 8
|
|
ble CGEMM_L4x8_SUB2_4
|
|
LOAD4x8_2
|
|
KERNEL4x8_L2 128,64, 0,0
|
|
KERNEL4x8_L2 128,64, 1,0
|
|
KERNEL4x8_L2 128,64, 2,0
|
|
KERNEL4x8_E2 128,64, 3,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_4:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 4
|
|
ble CGEMM_L4x8_SUB2_2
|
|
LOAD4x8_2
|
|
KERNEL4x8_L2 128,64, 0,0
|
|
KERNEL4x8_E2 128,64, 1,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 2
|
|
ble CGEMM_L4x8_SUB2_1
|
|
LOAD4x8_2
|
|
KERNEL4x8_E2 128,64, 0,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_SUB2_1:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 1
|
|
ble CGEMM_L4x8_SAVE
|
|
KERNEL4x8
|
|
|
|
MY_ALIGN
|
|
CGEMM_L4x8_SAVE:
|
|
/*----------------------------------------*/
|
|
addic. I, I, -1
|
|
MY_ALIGN
|
|
SAVE4x8
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4
|
|
#endif
|
|
bgt CGEMM_L4x8_BEGIN
|
|
andi. T2, M, 7
|
|
ble CGEMM_L4x1_END
|
|
andi. T1, M, 4
|
|
ble CGEMM_L4x4_END
|
|
b CGEMM_L4x4_BEGIN
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x8_END:
|
|
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L4x4_BEGIN:
|
|
/*----------------------------------------*/
|
|
andi. T2, M, 7
|
|
ble CGEMM_L4x1_END
|
|
andi. T1, M, 4
|
|
ble CGEMM_L4x4_END
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
|
|
#else
|
|
mr BO, B
|
|
#endif
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_TEMP_BK T6,K,TEMP_REG,4,4
|
|
mr T1, T6
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(T1-2) % 32x */
|
|
#else
|
|
mr T1, K
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(K-2) % 32x */
|
|
#endif
|
|
ZERO4x4
|
|
ble CGEMM_L4x4_SUB0
|
|
bl CGEMM_4x4_LMAIN_SUB
|
|
andi. L, T1, 31
|
|
ble CGEMM_L4x4_SAVE
|
|
b CGEMM_L4x4_SUB2
|
|
|
|
|
|
CGEMM_L4x4_SUB0:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
andi. L, T6, 63
|
|
cmpwi T6,33
|
|
#else
|
|
andi. L, K, 63
|
|
cmpwi K,33
|
|
#endif
|
|
li T8,1
|
|
bne CMP4x4_32K
|
|
addi BO,BO,-32
|
|
addi AO,AO,-32
|
|
LOAD4x4O 32,32
|
|
END4x4_WITHOUT_ADD
|
|
LOAD4x4_2O 64, 64
|
|
mtctr T8
|
|
bl CGEMM_L4x4_K32
|
|
b CGEMM_L4x4_SAVE
|
|
CMP4x4_32K:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
cmpwi T6,32
|
|
#else
|
|
cmpwi K,32
|
|
#endif
|
|
bne CGEMM_L4x4_SUB2
|
|
MY_ALIGN
|
|
mtctr T8
|
|
addi BO,BO,-64
|
|
addi AO,AO,-64
|
|
LOAD4x4_2O 64,64
|
|
bl CGEMM_L4x4_K32
|
|
b CGEMM_L4x4_SAVE
|
|
MY_ALIGN
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x4_SUB2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 16
|
|
ble CGEMM_L4x4_SUB2_8
|
|
bl CGEMM_4x4_L16_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x4_SUB2_8:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 8
|
|
ble CGEMM_L4x4_SUB2_4
|
|
bl CGEMM_4x4_L8_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x4_SUB2_4:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 4
|
|
ble CGEMM_L4x4_SUB2_2
|
|
LOAD4x4_2
|
|
KERNEL4x4_L2 64,64, 0,0
|
|
KERNEL4x4_E2 64,64, 1,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x4_SUB2_2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 2
|
|
ble CGEMM_L4x4_SUB2_1
|
|
LOAD4x4_2
|
|
KERNEL4x4_E2 64,64, 0,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x4_SUB2_1:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 1
|
|
ble CGEMM_L4x4_SAVE
|
|
KERNEL4x4
|
|
|
|
|
|
CGEMM_L4x4_SAVE:
|
|
/*----------------------------------------*/
|
|
SAVE4x4
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4
|
|
#endif
|
|
|
|
|
|
CGEMM_L4x4_END:
|
|
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L4x2_BEGIN:
|
|
/*----------------------------------------*/
|
|
andi. T1, M, 2
|
|
ble CGEMM_L4x2_END
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
|
|
#else
|
|
mr BO, B
|
|
#endif
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_TEMP_BK T6,K,TEMP_REG,2,4
|
|
mr T1, T6
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(T1-2) % 32x */
|
|
#else
|
|
mr T1, K
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(K-2) % 32x */
|
|
#endif
|
|
ZERO4x2
|
|
ble CGEMM_L4x2_SUB0
|
|
bl CGEMM_4x2_LMAIN_SUB
|
|
andi. L, T1, 31
|
|
ble CGEMM_L4x2_SAVE
|
|
b CGEMM_L4x2_SUB2
|
|
|
|
|
|
CGEMM_L4x2_SUB0:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
andi. L, T6, 63
|
|
cmpwi T6,33
|
|
#else
|
|
andi. L, K, 63
|
|
cmpwi K,33
|
|
#endif
|
|
li T8,1
|
|
bne CMP4x2_32K
|
|
addi BO,BO,-32
|
|
addi AO,AO,-16
|
|
LOAD4x2O 16,32
|
|
END4x2_WITHOUT_ADD
|
|
LOAD4x2_2O 32, 64
|
|
mtctr T8
|
|
bl CGEMM_L4x2_K32
|
|
b CGEMM_L4x2_SAVE
|
|
CMP4x2_32K:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
cmpwi T6,32
|
|
#else
|
|
cmpwi K,32
|
|
#endif
|
|
bne CGEMM_L4x2_SUB2
|
|
MY_ALIGN
|
|
mtctr T8
|
|
addi BO,BO,-64
|
|
addi AO,AO,-32
|
|
LOAD4x2_2O 32,64
|
|
bl CGEMM_L4x2_K32
|
|
b CGEMM_L4x2_SAVE
|
|
MY_ALIGN
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_SUB2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 16
|
|
ble CGEMM_L4x2_SUB2_8
|
|
bl CGEMM_4x2_L16_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_SUB2_8:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 8
|
|
ble CGEMM_L4x2_SUB2_4
|
|
bl CGEMM_4x2_L8_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_SUB2_4:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 4
|
|
ble CGEMM_L4x2_SUB2_2
|
|
LOAD4x2_2
|
|
KERNEL4x2_L2 32,64, 0,0
|
|
KERNEL4x2_E2 32,64, 1,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_SUB2_2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 2
|
|
ble CGEMM_L4x2_SUB2_1
|
|
LOAD4x2_2
|
|
KERNEL4x2_E2 32,64, 0,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x2_SUB2_1:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 1
|
|
ble CGEMM_L4x2_SAVE
|
|
KERNEL4x2
|
|
|
|
MY_ALIGN
|
|
CGEMM_L4x2_SAVE:
|
|
/*----------------------------------------*/
|
|
SAVE4x2
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4
|
|
#endif
|
|
|
|
|
|
CGEMM_L4x2_END:
|
|
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L4x1_BEGIN:
|
|
/*----------------------------------------*/
|
|
andi. T1, M, 1
|
|
ble CGEMM_L4x1_END
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
|
|
#else
|
|
mr BO, B
|
|
#endif
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_TEMP_BK T6,K,TEMP_REG,1,4
|
|
mr T1, T6
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(T1-2) % 32x */
|
|
#else
|
|
mr T1, K
|
|
addi T1,T1, -2
|
|
srawi. T8, T1, 5 /**(K-2) % 32x */
|
|
#endif
|
|
ZERO4x1
|
|
ble CGEMM_L4x1_SUB0
|
|
bl CGEMM_4x1_LMAIN_SUB
|
|
andi. L, T1, 31
|
|
ble CGEMM_L4x1_SAVE
|
|
b CGEMM_L4x1_SUB2
|
|
|
|
|
|
CGEMM_L4x1_SUB0:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
andi. L, T6, 63
|
|
cmpwi T6,33
|
|
#else
|
|
andi. L, K, 63
|
|
cmpwi K,33
|
|
#endif
|
|
li T8,1
|
|
bne CMP4x1_32K
|
|
addi BO,BO,-32
|
|
addi AO,AO,-8
|
|
LOAD4x1O 8,32
|
|
END4x1_WITHOUT_ADD
|
|
LOAD4x1_2O 16, 64
|
|
mtctr T8
|
|
bl CGEMM_L4x1_K32
|
|
b CGEMM_L4x1_SAVE
|
|
CMP4x1_32K:
|
|
/*----------------------------------------*/
|
|
#if defined(TRMMKERNEL)
|
|
cmpwi T6,32
|
|
#else
|
|
cmpwi K,32
|
|
#endif
|
|
bne CGEMM_L4x1_SUB2
|
|
MY_ALIGN
|
|
mtctr T8
|
|
addi BO,BO,-64
|
|
addi AO,AO,-16
|
|
LOAD4x1_2O 16,64
|
|
bl CGEMM_L4x1_K32
|
|
b CGEMM_L4x1_SAVE
|
|
MY_ALIGN
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x1_SUB2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 16
|
|
ble CGEMM_L4x1_SUB2_8
|
|
bl CGEMM_4x1_L16_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x1_SUB2_8:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 8
|
|
ble CGEMM_L4x1_SUB2_4
|
|
bl CGEMM_4x1_L8_SUB
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x1_SUB2_4:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 4
|
|
ble CGEMM_L4x1_SUB2_2
|
|
LOAD4x1_2
|
|
KERNEL4x1_L2 16,64, 0,0
|
|
KERNEL4x1_E2 16,64, 1,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x1_SUB2_2:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 2
|
|
ble CGEMM_L4x1_SUB2_1
|
|
LOAD4x1_2
|
|
KERNEL4x1_E2 16,64, 0,1
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L4x1_SUB2_1:
|
|
/*----------------------------------------*/
|
|
andi. T1,L, 1
|
|
ble CGEMM_L4x1_SAVE
|
|
KERNEL4x1
|
|
|
|
MY_ALIGN
|
|
CGEMM_L4x1_SAVE:
|
|
/*----------------------------------------*/
|
|
|
|
SAVE4x1
|
|
#if defined(TRMMKERNEL)
|
|
REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4
|
|
#endif
|
|
|
|
|
|
CGEMM_L4x1_END:
|
|
/*----------------------------------------*/
|
|
slwi T1, K, 5
|
|
addic. J, J, -1
|
|
add B, B, T1
|
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
addi TEMP_REG, TEMP_REG, 4
|
|
#endif
|
|
bgt CGEMM_L4_BEGIN
|
|
|
|
|
|
CGEMM_L4_END:
|
|
|
|
b CGEMM_L2
|
|
/* MINI SUBROUTINES */
|
|
/* 2x8 MAIN 128x+2 LOOP */
|
|
|
|
|
|
CGEMM_L2x8_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD2x8_2
|
|
MY_ALIGN
|
|
CGEMM_L2x8_LOOP:
|
|
/*----------------------------------------*/
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL2x8_L2 128,32,0,0
|
|
CGEMM_L2x8_K128:
|
|
/*----------------------------------------*/
|
|
KERNEL2x8_L2 128,32,1,0
|
|
dcbt AO, T2
|
|
KERNEL2x8_L2 128,32,2,0
|
|
KERNEL2x8_L2 128,32,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL2x8_L2 128,32,4,0
|
|
KERNEL2x8_L2 128,32,5,0
|
|
dcbt AO, T4
|
|
KERNEL2x8_L2 128,32,6,0
|
|
KERNEL2x8_L2 128,32,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL2x8_L2 128,32,8,0
|
|
KERNEL2x8_L2 128,32,9,0
|
|
KERNEL2x8_L2 128,32,10,0
|
|
KERNEL2x8_L2 128,32,11,0
|
|
dcbt BO, T4
|
|
KERNEL2x8_L2 128,32,12,0
|
|
KERNEL2x8_L2 128,32,13,0
|
|
KERNEL2x8_L2 128,32,14,0
|
|
KERNEL2x8_L2 128,32,15,0
|
|
KERNEL2x8_L2 128,32,16,0
|
|
KERNEL2x8_L2 128,32,17,0
|
|
KERNEL2x8_L2 128,32,18,0
|
|
KERNEL2x8_L2 128,32,19,0
|
|
KERNEL2x8_L2 128,32,20,0
|
|
KERNEL2x8_L2 128,32,21,0
|
|
KERNEL2x8_L2 128,32,22,0
|
|
KERNEL2x8_L2 128,32,23,0
|
|
KERNEL2x8_L2 128,32,24,0
|
|
KERNEL2x8_L2 128,32,25,0
|
|
KERNEL2x8_L2 128,32,26,0
|
|
KERNEL2x8_L2 128,32,27,0
|
|
KERNEL2x8_L2 128,32,28,0
|
|
KERNEL2x8_L2 128,32,29,0
|
|
KERNEL2x8_L2 128,32,30,0
|
|
KERNEL2x8_L2 128,32,31,0
|
|
KERNEL2x8_L2 128,32,32,0
|
|
KERNEL2x8_L2 128,32,33,0
|
|
KERNEL2x8_L2 128,32,34,0
|
|
KERNEL2x8_L2 128,32,35,0
|
|
KERNEL2x8_L2 128,32,36,0
|
|
KERNEL2x8_L2 128,32,37,0
|
|
KERNEL2x8_L2 128,32,38,0
|
|
KERNEL2x8_L2 128,32,39,0
|
|
KERNEL2x8_L2 128,32,40,0
|
|
KERNEL2x8_L2 128,32,41,0
|
|
KERNEL2x8_L2 128,32,42,0
|
|
KERNEL2x8_L2 128,32,43,0
|
|
KERNEL2x8_L2 128,32,44,0
|
|
KERNEL2x8_L2 128,32,45,0
|
|
KERNEL2x8_L2 128,32,46,0
|
|
KERNEL2x8_L2 128,32,47,0
|
|
KERNEL2x8_L2 128,32,48,0
|
|
KERNEL2x8_L2 128,32,49,0
|
|
KERNEL2x8_L2 128,32,50,0
|
|
KERNEL2x8_L2 128,32,51,0
|
|
KERNEL2x8_L2 128,32,52,0
|
|
KERNEL2x8_L2 128,32,53,0
|
|
KERNEL2x8_L2 128,32,54,0
|
|
KERNEL2x8_L2 128,32,55,0
|
|
KERNEL2x8_L2 128,32,56,0
|
|
KERNEL2x8_L2 128,32,57,0
|
|
KERNEL2x8_L2 128,32,58,0
|
|
KERNEL2x8_L2 128,32,59,0
|
|
KERNEL2x8_L2 128,32,60,0
|
|
KERNEL2x8_L2 128,32,61,0
|
|
KERNEL2x8_L2 128,32,62,0
|
|
KERNEL2x8_L2 128,32,63,1
|
|
bdnz CGEMM_L2x8_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L2x8_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END2x8_2
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x8_L64_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL2x8_L2 128,32,0,0
|
|
KERNEL2x8_L2 128,32,1,0
|
|
dcbt AO, T2
|
|
KERNEL2x8_L2 128,32,2,0
|
|
KERNEL2x8_L2 128,32,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL2x8_L2 128,32,4,0
|
|
KERNEL2x8_L2 128,32,5,0
|
|
dcbt AO, T4
|
|
KERNEL2x8_L2 128,32,6,0
|
|
KERNEL2x8_L2 128,32,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL2x8_L2 128,32,8,0
|
|
KERNEL2x8_L2 128,32,9,0
|
|
KERNEL2x8_L2 128,32,10,0
|
|
KERNEL2x8_L2 128,32,11,0
|
|
dcbt BO, T4
|
|
KERNEL2x8_L2 128,32,12,0
|
|
KERNEL2x8_L2 128,32,13,0
|
|
KERNEL2x8_L2 128,32,14,0
|
|
KERNEL2x8_L2 128,32,15,0
|
|
KERNEL2x8_L2 128,32,16,0
|
|
KERNEL2x8_L2 128,32,17,0
|
|
KERNEL2x8_L2 128,32,18,0
|
|
KERNEL2x8_L2 128,32,19,0
|
|
KERNEL2x8_L2 128,32,20,0
|
|
KERNEL2x8_L2 128,32,21,0
|
|
KERNEL2x8_L2 128,32,22,0
|
|
KERNEL2x8_L2 128,32,23,0
|
|
KERNEL2x8_L2 128,32,24,0
|
|
KERNEL2x8_L2 128,32,25,0
|
|
KERNEL2x8_L2 128,32,26,0
|
|
KERNEL2x8_L2 128,32,27,0
|
|
KERNEL2x8_L2 128,32,28,0
|
|
KERNEL2x8_L2 128,32,29,0
|
|
KERNEL2x8_L2 128,32,30,0
|
|
KERNEL2x8_E2 128,32,31,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x8_L32_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL2x8_L2 128,32,0,0
|
|
KERNEL2x8_L2 128,32,1,0
|
|
dcbt AO, T2
|
|
KERNEL2x8_L2 128,32,2,0
|
|
KERNEL2x8_L2 128,32,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL2x8_L2 128,32,4,0
|
|
KERNEL2x8_L2 128,32,5,0
|
|
dcbt AO, T4
|
|
KERNEL2x8_L2 128,32,6,0
|
|
KERNEL2x8_L2 128,32,7,0
|
|
dcbt AO, T5
|
|
dcbt BO, T3
|
|
KERNEL2x8_L2 128,32,8,0
|
|
KERNEL2x8_L2 128,32,9,0
|
|
KERNEL2x8_L2 128,32,10,0
|
|
KERNEL2x8_L2 128,32,11,0
|
|
dcbt BO, T4
|
|
KERNEL2x8_L2 128,32,12,0
|
|
KERNEL2x8_L2 128,32,13,0
|
|
KERNEL2x8_L2 128,32,14,0
|
|
KERNEL2x8_E2 128,32,15,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x8_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x8_2
|
|
dcbt AO, PRE
|
|
dcbt BO, PRE
|
|
KERNEL2x8_L2 128,32,0,0
|
|
KERNEL2x8_L2 128,32,1,0
|
|
dcbt AO, T2
|
|
KERNEL2x8_L2 128,32,2,0
|
|
KERNEL2x8_L2 128,32,3,0
|
|
dcbt AO, T3
|
|
dcbt BO, T2
|
|
KERNEL2x8_L2 128,32,4,0
|
|
KERNEL2x8_L2 128,32,5,0
|
|
dcbt AO, T4
|
|
KERNEL2x8_L2 128,32,6,0
|
|
KERNEL2x8_E2 128,32,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x4_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD2x4_2
|
|
MY_ALIGN
|
|
CGEMM_L2x4_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL2x4_L2 64,32,0,0
|
|
CGEMM_L2x4_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL2x4_L2 64,32,1,0
|
|
KERNEL2x4_L2 64,32,2,0
|
|
KERNEL2x4_L2 64,32,3,0
|
|
KERNEL2x4_L2 64,32,4,0
|
|
KERNEL2x4_L2 64,32,5,0
|
|
KERNEL2x4_L2 64,32,6,0
|
|
KERNEL2x4_L2 64,32,7,0
|
|
KERNEL2x4_L2 64,32,8,0
|
|
KERNEL2x4_L2 64,32,9,0
|
|
KERNEL2x4_L2 64,32,10,0
|
|
KERNEL2x4_L2 64,32,11,0
|
|
KERNEL2x4_L2 64,32,12,0
|
|
KERNEL2x4_L2 64,32,13,0
|
|
KERNEL2x4_L2 64,32,14,0
|
|
KERNEL2x4_L2 64,32,15,1
|
|
bdnz CGEMM_L2x4_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L2x4_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END2x4_2
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x4_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x4_2
|
|
KERNEL2x4_L2 64,32,0,0
|
|
KERNEL2x4_L2 64,32,1,0
|
|
KERNEL2x4_L2 64,32,2,0
|
|
KERNEL2x4_L2 64,32,3,0
|
|
KERNEL2x4_L2 64,32,4,0
|
|
KERNEL2x4_L2 64,32,5,0
|
|
KERNEL2x4_L2 64,32,6,0
|
|
KERNEL2x4_E2 64,32,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x4_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x4_2
|
|
KERNEL2x4_L2 64,32,0,0
|
|
KERNEL2x4_L2 64,32,1,0
|
|
KERNEL2x4_L2 64,32,2,0
|
|
KERNEL2x4_E2 64,32,3,1
|
|
blr
|
|
|
|
|
|
CGEMM_2x2_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD2x2_2
|
|
MY_ALIGN
|
|
CGEMM_L2x2_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL2x2_L2 32,32,0,0
|
|
CGEMM_L2x2_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL2x2_L2 32,32,1,0
|
|
KERNEL2x2_L2 32,32,2,0
|
|
KERNEL2x2_L2 32,32,3,0
|
|
KERNEL2x2_L2 32,32,4,0
|
|
KERNEL2x2_L2 32,32,5,0
|
|
KERNEL2x2_L2 32,32,6,0
|
|
KERNEL2x2_L2 32,32,7,0
|
|
KERNEL2x2_L2 32,32,8,0
|
|
KERNEL2x2_L2 32,32,9,0
|
|
KERNEL2x2_L2 32,32,10,0
|
|
KERNEL2x2_L2 32,32,11,0
|
|
KERNEL2x2_L2 32,32,12,0
|
|
KERNEL2x2_L2 32,32,13,0
|
|
KERNEL2x2_L2 32,32,14,0
|
|
KERNEL2x2_L2 32,32,15,1
|
|
bdnz CGEMM_L2x2_LOOP
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L2x2_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END2x2_2
|
|
blr
|
|
MY_ALIGN
|
|
CGEMM_2x2_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x2_2
|
|
KERNEL2x2_L2 32,32,0,0
|
|
KERNEL2x2_L2 32,32,1,0
|
|
KERNEL2x2_L2 32,32,2,0
|
|
KERNEL2x2_L2 32,32,3,0
|
|
KERNEL2x2_L2 32,32,4,0
|
|
KERNEL2x2_L2 32,32,5,0
|
|
KERNEL2x2_L2 32,32,6,0
|
|
KERNEL2x2_E2 32,32,7,1
|
|
blr
|
|
MY_ALIGN
|
|
CGEMM_2x2_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x2_2
|
|
KERNEL2x2_L2 32,32,0,0
|
|
KERNEL2x2_L2 32,32,1,0
|
|
KERNEL2x2_L2 32,32,2,0
|
|
KERNEL2x2_E2 32,32,3,1
|
|
blr
|
|
|
|
|
|
CGEMM_2x1_LMAIN_SUB:
|
|
/*----------------------------------------*/
|
|
mtctr T8
|
|
LOAD2x1_2
|
|
MY_ALIGN
|
|
CGEMM_L2x1_LOOP:
|
|
/*----------------------------------------*/
|
|
KERNEL2x1_L2 16,32,0,0
|
|
CGEMM_L2x1_K32:
|
|
/*----------------------------------------*/
|
|
KERNEL2x1_L2 16,32,1,0
|
|
KERNEL2x1_L2 16,32,2,0
|
|
KERNEL2x1_L2 16,32,3,0
|
|
KERNEL2x1_L2 16,32,4,0
|
|
KERNEL2x1_L2 16,32,5,0
|
|
KERNEL2x1_L2 16,32,6,0
|
|
KERNEL2x1_L2 16,32,7,0
|
|
KERNEL2x1_L2 16,32,8,0
|
|
KERNEL2x1_L2 16,32,9,0
|
|
KERNEL2x1_L2 16,32,10,0
|
|
KERNEL2x1_L2 16,32,11,0
|
|
KERNEL2x1_L2 16,32,12,0
|
|
KERNEL2x1_L2 16,32,13,0
|
|
KERNEL2x1_L2 16,32,14,0
|
|
KERNEL2x1_L2 16,32,15,1
|
|
bdnz CGEMM_L2x1_LOOP
|
|
MY_ALIGN
|
|
CGEMM_L2x1_LOOP_END:
|
|
/*----------------------------------------*/
|
|
END2x1_2
|
|
blr
|
|
|
|
MY_ALIGN
|
|
CGEMM_2x1_L16_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x1_2
|
|
KERNEL2x1_L2 16,32,0,0
|
|
KERNEL2x1_L2 16,32,1,0
|
|
KERNEL2x1_L2 16,32,2,0
|
|
KERNEL2x1_L2 16,32,3,0
|
|
KERNEL2x1_L2 16,32,4,0
|
|
KERNEL2x1_L2 16,32,5,0
|
|
KERNEL2x1_L2 16,32,6,0
|
|
KERNEL2x1_E2 16,32,7,1
|
|
blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_2x1_L8_SUB:
|
|
/*----------------------------------------*/
|
|
LOAD2x1_2
|
|
KERNEL2x1_L2 16,32,0,0
|
|
KERNEL2x1_L2 16,32,1,0
|
|
KERNEL2x1_L2 16,32,2,0
|
|
KERNEL2x1_E2 16,32,3,1
|
|
blr
|
|
|
|
|
|
|
|
/* MAIN LOOP BEGINS */
|
|
MY_ALIGN
|
|
|
|
|
|
/* MAIN LOOP BEGINS */
    MY_ALIGN

CGEMM_L2:
/*----------------------------------------*/
    /* N&2 pass: compute two columns of C at a time.
       8-row tiles first (CGEMM_L2x8_*), remainders handled later. */
    andi.   J, N, 2
    ble     CGEMM_L2_END

CGEMM_L2_BEGIN:
/*----------------------------------------*/
    mr      CO, C                   /* CO -> start of this column pair  */
    slwi    T1, LDC, 1              /* T1 = 2*LDC                       */
    add     T2, C, LDC              /* T2 -> second column (prefetch)   */
    mr      AO, A                   /* restart A for this column pair   */
    add     C, C, T1                /* advance C past the two columns   */
#if defined(TRMMKERNEL) && defined(LEFT)
    mr      TEMP_REG, OFFSET        /* off = offset; */
#endif
    srawi.  I, M, 3                 /* I = number of 8-row tiles        */
    ble     CGEMM_L2x8_END
    dcbt    CO, r0                  /* just prefetch */
    dcbt    T2, r0

CGEMM_L2x8_BEGIN:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
#else
    mr      BO, B
    dcbt    B, r0
#endif
    dcbt    AO, r0
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
    mr      T1, T6
    /* temps for prefetch */
    li      T2, 1024
    li      T3, 1024+512
    addi    T1, T1, -2              /* two iterations live in the software pipeline */
    /* temps for prefetch */
    li      T4, 2048
    li      T5, 2048+512
    srawi.  T8, T1, 7               /* T8 = (T1-2)/128 main-loop trips */
#else
    mr      T1, K
    /* temps for prefetch */
    li      T2, 1024
    li      T3, 1024+512
    addi    T1, T1, -2              /* two iterations live in the software pipeline */
    /* temps for prefetch */
    li      T4, 2048
    li      T5, 2048+512
    srawi.  T8, T1, 7               /* T8 = (K-2)/128 main-loop trips */
#endif
    ZERO2x8                         /* clear the 2x8 accumulators */
    ble     CGEMM_L2x8_SUB0         /* fewer than 128+2 K iters: tail only */
    bl      CGEMM_L2x8_LMAIN_SUB
    andi.   L, T1, 127              /* L = leftover K after 128-blocks */
    ble     CGEMM_L2x8_SAVE
    b       CGEMM_L2x8_SUB2

CGEMM_L2x8_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==128 and K==129: run the unrolled body exactly once. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 255
    cmpwi   T6, 129
#else
    andi.   L, K, 255
    cmpwi   K, 129
#endif
    li      T8, 1
    bne     CMP2x8_128K
    /* K==129: prime one extra iteration, then one 128-iteration pass.
       Pointer pre-bias compensates for the offsets the load macros add. */
    addi    BO, BO, -16
    addi    AO, AO, -64
    LOAD2x8O 64,16
    END2x8_WITHOUT_ADD
    LOAD2x8_2O 128, 32
    mtctr   T8
    bl      CGEMM_L2x8_K128
    b       CGEMM_L2x8_SAVE
CMP2x8_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 128
#else
    cmpwi   K, 128
#endif
    bne     CGEMM_L2x8_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -32
    addi    AO, AO, -128
    LOAD2x8_2O 128,32
    bl      CGEMM_L2x8_K128
    b       CGEMM_L2x8_SAVE
    MY_ALIGN

CGEMM_L2x8_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 64/32/16/8/4/2/1. */
    andi.   T1, L, 64
    ble     CGEMM_L2x8_SUB2_32
    bl      CGEMM_2x8_L64_SUB

CGEMM_L2x8_SUB2_32:
/*----------------------------------------*/
    andi.   T1, L, 32
    ble     CGEMM_L2x8_SUB2_16
    bl      CGEMM_2x8_L32_SUB

CGEMM_L2x8_SUB2_16:
/*----------------------------------------*/
    andi.   T1, L, 16
    ble     CGEMM_L2x8_SUB2_8
    bl      CGEMM_2x8_L16_SUB

CGEMM_L2x8_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L2x8_SUB2_4
    LOAD2x8_2
    KERNEL2x8_L2 128,32,0,0
    KERNEL2x8_L2 128,32,1,0
    KERNEL2x8_L2 128,32,2,0
    KERNEL2x8_E2 128,32,3,1

CGEMM_L2x8_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L2x8_SUB2_2
    LOAD2x8_2
    KERNEL2x8_L2 128,32,0,0
    KERNEL2x8_E2 128,32,1,1

CGEMM_L2x8_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L2x8_SUB2_1
    LOAD2x8_2
    KERNEL2x8_E2 128,32,0,1

CGEMM_L2x8_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L2x8_SAVE
    KERNEL2x8                       /* single (non-pipelined) K iteration */
    MY_ALIGN

CGEMM_L2x8_SAVE:
/*----------------------------------------*/
    addic.  I, I, -1
    MY_ALIGN
    SAVE2x8                         /* scale by alpha and store the 2x8 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
#endif
    bgt     CGEMM_L2x8_BEGIN        /* next 8-row tile */
    /* Row remainder dispatch (M&7): skip straight past sizes not present. */
    andi.   T2, M, 7
    ble     CGEMM_L2x1_END
    andi.   T1, M, 4
    ble     CGEMM_L2x4_END
    b       CGEMM_L2x4_BEGIN
    MY_ALIGN

CGEMM_L2x8_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L2x4_BEGIN:
/*----------------------------------------*/
    /* 4-row x 2-column remainder tile (taken when M&4). */
    andi.   T2, M, 7
    ble     CGEMM_L2x1_END
    andi.   T1, M, 4
    ble     CGEMM_L2x4_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO2x4
    ble     CGEMM_L2x4_SUB0
    bl      CGEMM_2x4_LMAIN_SUB
    andi.   L, T1, 31               /* leftover K after 32-blocks */
    ble     CGEMM_L2x4_SAVE
    b       CGEMM_L2x4_SUB2

CGEMM_L2x4_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP2x4_32K
    addi    BO, BO, -16             /* pre-bias for the load macro offsets */
    addi    AO, AO, -32
    LOAD2x4O 32,16
    END2x4_WITHOUT_ADD
    LOAD2x4_2O 64, 32
    mtctr   T8
    bl      CGEMM_L2x4_K32
    b       CGEMM_L2x4_SAVE
CMP2x4_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L2x4_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -32
    addi    AO, AO, -64
    LOAD2x4_2O 64,32
    bl      CGEMM_L2x4_K32
    b       CGEMM_L2x4_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L2x4_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L2x4_SUB2_8
    bl      CGEMM_2x4_L16_SUB

CGEMM_L2x4_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L2x4_SUB2_4
    bl      CGEMM_2x4_L8_SUB

CGEMM_L2x4_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L2x4_SUB2_2
    LOAD2x4_2
    KERNEL2x4_L2 64,32,0,0
    KERNEL2x4_E2 64,32,1,1

CGEMM_L2x4_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L2x4_SUB2_1
    LOAD2x4_2
    KERNEL2x4_E2 64,32,0,1

CGEMM_L2x4_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L2x4_SAVE
    KERNEL2x4                       /* single K iteration */

CGEMM_L2x4_SAVE:
/*----------------------------------------*/
    SAVE2x4                         /* scale by alpha and store the 2x4 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
#endif

CGEMM_L2x4_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L2x2_BEGIN:
/*----------------------------------------*/
    /* 2-row x 2-column remainder tile (taken when M&2). */
    andi.   T1, M, 2
    ble     CGEMM_L2x2_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO2x2
    ble     CGEMM_L2x2_SUB0
    bl      CGEMM_2x2_LMAIN_SUB
    andi.   L, T1, 31
    ble     CGEMM_L2x2_SAVE
    b       CGEMM_L2x2_SUB2

CGEMM_L2x2_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP2x2_32K
    addi    BO, BO, -16             /* pre-bias for the load macro offsets */
    addi    AO, AO, -16
    LOAD2x2O 16,16
    END2x2_WITHOUT_ADD
    LOAD2x2_2O 32, 32
    mtctr   T8
    bl      CGEMM_L2x2_K32
    b       CGEMM_L2x2_SAVE
CMP2x2_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L2x2_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -32
    addi    AO, AO, -32
    LOAD2x2_2O 32,32
    bl      CGEMM_L2x2_K32
    b       CGEMM_L2x2_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L2x2_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L2x2_SUB2_8
    bl      CGEMM_2x2_L16_SUB

CGEMM_L2x2_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L2x2_SUB2_4
    bl      CGEMM_2x2_L8_SUB

CGEMM_L2x2_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L2x2_SUB2_2
    LOAD2x2_2
    KERNEL2x2_L2 32,32,0,0
    KERNEL2x2_E2 32,32,1,1

CGEMM_L2x2_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L2x2_SUB2_1
    LOAD2x2_2
    KERNEL2x2_E2 32,32,0,1

CGEMM_L2x2_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L2x2_SAVE
    KERNEL2x2                       /* single K iteration */
    MY_ALIGN

CGEMM_L2x2_SAVE:
/*----------------------------------------*/
    SAVE2x2                         /* scale by alpha and store the 2x2 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
#endif

CGEMM_L2x2_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L2x1_BEGIN:
/*----------------------------------------*/
    /* 1-row x 2-column remainder tile (taken when M&1), then the N=2
       pass epilogue that advances B and the TRMM offset. */
    andi.   T1, M, 1
    ble     CGEMM_L2x1_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO2x1
    ble     CGEMM_L2x1_SUB0
    bl      CGEMM_2x1_LMAIN_SUB
    andi.   L, T1, 31
    ble     CGEMM_L2x1_SAVE
    b       CGEMM_L2x1_SUB2

CGEMM_L2x1_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP2x1_32K
    addi    BO, BO, -16             /* pre-bias for the load macro offsets */
    addi    AO, AO, -8
    LOAD2x1O 8,16
    END2x1_WITHOUT_ADD
    LOAD2x1_2O 16, 32
    mtctr   T8
    bl      CGEMM_L2x1_K32
    b       CGEMM_L2x1_SAVE
CMP2x1_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L2x1_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -32
    addi    AO, AO, -16
    LOAD2x1_2O 16,32
    bl      CGEMM_L2x1_K32
    b       CGEMM_L2x1_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L2x1_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L2x1_SUB2_8
    bl      CGEMM_2x1_L16_SUB

CGEMM_L2x1_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L2x1_SUB2_4
    bl      CGEMM_2x1_L8_SUB

CGEMM_L2x1_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L2x1_SUB2_2
    LOAD2x1_2
    KERNEL2x1_L2 16,32,0,0
    KERNEL2x1_E2 16,32,1,1

CGEMM_L2x1_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L2x1_SUB2_1
    LOAD2x1_2
    KERNEL2x1_E2 16,32,0,1

CGEMM_L2x1_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L2x1_SAVE
    KERNEL2x1                       /* single K iteration */
    MY_ALIGN

CGEMM_L2x1_SAVE:
/*----------------------------------------*/
    SAVE2x1                         /* scale by alpha and store the 2x1 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
#endif

CGEMM_L2x1_END:
/*----------------------------------------*/
    slwi    T1, K, 4                /* B += K * 16 bytes (2 cols * 8-byte complex) */
    add     B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
    addi    TEMP_REG, TEMP_REG, 2   /* off += 2 columns consumed */
#endif

CGEMM_L2_END:

    b       CGEMM_L1
|
|
/* MINI SUBROUTINES */
|
|
/* 1x8 MAIN 128x+2 LOOP */
|
|
|
|
|
|
CGEMM_L1x8_LMAIN_SUB:
/*----------------------------------------*/
    /* Main K loop for the 1x8 tile: 128 K iterations per CTR decrement
       (64 unrolled-by-2 kernel calls), with dcbt prefetch of AO/BO at
       the offsets the caller placed in T2..T5. */
    mtctr   T8
    LOAD1x8_2
    MY_ALIGN
CGEMM_L1x8_LOOP:
/*----------------------------------------*/
    dcbt    AO, PRE
    dcbt    BO, PRE
    KERNEL1x8_L2 128,16,0,0
CGEMM_L1x8_K128:
    /* Secondary entry used by the K==128/129 fast paths in CGEMM_L1x8_SUB0. */
/*----------------------------------------*/
    KERNEL1x8_L2 128,16,1,0
    dcbt    AO, T2
    KERNEL1x8_L2 128,16,2,0
    KERNEL1x8_L2 128,16,3,0
    dcbt    AO, T3
    dcbt    BO, T2
    KERNEL1x8_L2 128,16,4,0
    KERNEL1x8_L2 128,16,5,0
    dcbt    AO, T4
    KERNEL1x8_L2 128,16,6,0
    KERNEL1x8_L2 128,16,7,0
    dcbt    AO, T5
    dcbt    BO, T3
    KERNEL1x8_L2 128,16,8,0
    KERNEL1x8_L2 128,16,9,0
    KERNEL1x8_L2 128,16,10,0
    KERNEL1x8_L2 128,16,11,0
    dcbt    BO, T4
    KERNEL1x8_L2 128,16,12,0
    KERNEL1x8_L2 128,16,13,0
    KERNEL1x8_L2 128,16,14,0
    KERNEL1x8_L2 128,16,15,0
    KERNEL1x8_L2 128,16,16,0
    KERNEL1x8_L2 128,16,17,0
    KERNEL1x8_L2 128,16,18,0
    KERNEL1x8_L2 128,16,19,0
    KERNEL1x8_L2 128,16,20,0
    KERNEL1x8_L2 128,16,21,0
    KERNEL1x8_L2 128,16,22,0
    KERNEL1x8_L2 128,16,23,0
    KERNEL1x8_L2 128,16,24,0
    KERNEL1x8_L2 128,16,25,0
    KERNEL1x8_L2 128,16,26,0
    KERNEL1x8_L2 128,16,27,0
    KERNEL1x8_L2 128,16,28,0
    KERNEL1x8_L2 128,16,29,0
    KERNEL1x8_L2 128,16,30,0
    KERNEL1x8_L2 128,16,31,0
    KERNEL1x8_L2 128,16,32,0
    KERNEL1x8_L2 128,16,33,0
    KERNEL1x8_L2 128,16,34,0
    KERNEL1x8_L2 128,16,35,0
    KERNEL1x8_L2 128,16,36,0
    KERNEL1x8_L2 128,16,37,0
    KERNEL1x8_L2 128,16,38,0
    KERNEL1x8_L2 128,16,39,0
    KERNEL1x8_L2 128,16,40,0
    KERNEL1x8_L2 128,16,41,0
    KERNEL1x8_L2 128,16,42,0
    KERNEL1x8_L2 128,16,43,0
    KERNEL1x8_L2 128,16,44,0
    KERNEL1x8_L2 128,16,45,0
    KERNEL1x8_L2 128,16,46,0
    KERNEL1x8_L2 128,16,47,0
    KERNEL1x8_L2 128,16,48,0
    KERNEL1x8_L2 128,16,49,0
    KERNEL1x8_L2 128,16,50,0
    KERNEL1x8_L2 128,16,51,0
    KERNEL1x8_L2 128,16,52,0
    KERNEL1x8_L2 128,16,53,0
    KERNEL1x8_L2 128,16,54,0
    KERNEL1x8_L2 128,16,55,0
    KERNEL1x8_L2 128,16,56,0
    KERNEL1x8_L2 128,16,57,0
    KERNEL1x8_L2 128,16,58,0
    KERNEL1x8_L2 128,16,59,0
    KERNEL1x8_L2 128,16,60,0
    KERNEL1x8_L2 128,16,61,0
    KERNEL1x8_L2 128,16,62,0
    KERNEL1x8_L2 128,16,63,1
    bdnz    CGEMM_L1x8_LOOP
    MY_ALIGN
CGEMM_L1x8_LOOP_END:
/*----------------------------------------*/
    END1x8_2                        /* flush the last software-pipelined pair */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x8_L64_SUB:
/*----------------------------------------*/
    /* Process 64 K iterations for the 1x8 tile (32 unrolled-by-2 calls),
       with the same dcbt prefetch pattern as the main loop. */
    LOAD1x8_2
    dcbt    AO, PRE
    dcbt    BO, PRE
    KERNEL1x8_L2 128,16,0,0
    KERNEL1x8_L2 128,16,1,0
    dcbt    AO, T2
    KERNEL1x8_L2 128,16,2,0
    KERNEL1x8_L2 128,16,3,0
    dcbt    AO, T3
    dcbt    BO, T2
    KERNEL1x8_L2 128,16,4,0
    KERNEL1x8_L2 128,16,5,0
    dcbt    AO, T4
    KERNEL1x8_L2 128,16,6,0
    KERNEL1x8_L2 128,16,7,0
    dcbt    AO, T5
    dcbt    BO, T3
    KERNEL1x8_L2 128,16,8,0
    KERNEL1x8_L2 128,16,9,0
    KERNEL1x8_L2 128,16,10,0
    KERNEL1x8_L2 128,16,11,0
    dcbt    BO, T4
    KERNEL1x8_L2 128,16,12,0
    KERNEL1x8_L2 128,16,13,0
    KERNEL1x8_L2 128,16,14,0
    KERNEL1x8_L2 128,16,15,0
    KERNEL1x8_L2 128,16,16,0
    KERNEL1x8_L2 128,16,17,0
    KERNEL1x8_L2 128,16,18,0
    KERNEL1x8_L2 128,16,19,0
    KERNEL1x8_L2 128,16,20,0
    KERNEL1x8_L2 128,16,21,0
    KERNEL1x8_L2 128,16,22,0
    KERNEL1x8_L2 128,16,23,0
    KERNEL1x8_L2 128,16,24,0
    KERNEL1x8_L2 128,16,25,0
    KERNEL1x8_L2 128,16,26,0
    KERNEL1x8_L2 128,16,27,0
    KERNEL1x8_L2 128,16,28,0
    KERNEL1x8_L2 128,16,29,0
    KERNEL1x8_L2 128,16,30,0
    KERNEL1x8_E2 128,16,31,1        /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x8_L32_SUB:
/*----------------------------------------*/
    /* Process 32 K iterations for the 1x8 tile (16 unrolled-by-2 calls). */
    LOAD1x8_2
    dcbt    AO, PRE
    dcbt    BO, PRE
    KERNEL1x8_L2 128,16,0,0
    KERNEL1x8_L2 128,16,1,0
    dcbt    AO, T2
    KERNEL1x8_L2 128,16,2,0
    KERNEL1x8_L2 128,16,3,0
    dcbt    AO, T3
    dcbt    BO, T2
    KERNEL1x8_L2 128,16,4,0
    KERNEL1x8_L2 128,16,5,0
    dcbt    AO, T4
    KERNEL1x8_L2 128,16,6,0
    KERNEL1x8_L2 128,16,7,0
    dcbt    AO, T5
    dcbt    BO, T3
    KERNEL1x8_L2 128,16,8,0
    KERNEL1x8_L2 128,16,9,0
    KERNEL1x8_L2 128,16,10,0
    KERNEL1x8_L2 128,16,11,0
    dcbt    BO, T4
    KERNEL1x8_L2 128,16,12,0
    KERNEL1x8_L2 128,16,13,0
    KERNEL1x8_L2 128,16,14,0
    KERNEL1x8_E2 128,16,15,1        /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x8_L16_SUB:
/*----------------------------------------*/
    /* Process 16 K iterations for the 1x8 tile (8 unrolled-by-2 calls). */
    LOAD1x8_2
    dcbt    AO, PRE
    dcbt    BO, PRE
    KERNEL1x8_L2 128,16,0,0
    KERNEL1x8_L2 128,16,1,0
    dcbt    AO, T2
    KERNEL1x8_L2 128,16,2,0
    KERNEL1x8_L2 128,16,3,0
    dcbt    AO, T3
    dcbt    BO, T2
    KERNEL1x8_L2 128,16,4,0
    KERNEL1x8_L2 128,16,5,0
    dcbt    AO, T4
    KERNEL1x8_L2 128,16,6,0
    KERNEL1x8_E2 128,16,7,1         /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x4_LMAIN_SUB:
/*----------------------------------------*/
    /* Main K loop for the 1x4 tile: 32 K iterations per CTR decrement. */
    mtctr   T8
    LOAD1x4_2
    MY_ALIGN
CGEMM_L1x4_LOOP:
/*----------------------------------------*/
    KERNEL1x4_L2 64,16,0,0
CGEMM_L1x4_K32:
    /* Secondary entry used by the K==32/33 fast paths in CGEMM_L1x4_SUB0. */
/*----------------------------------------*/
    KERNEL1x4_L2 64,16,1,0
    KERNEL1x4_L2 64,16,2,0
    KERNEL1x4_L2 64,16,3,0
    KERNEL1x4_L2 64,16,4,0
    KERNEL1x4_L2 64,16,5,0
    KERNEL1x4_L2 64,16,6,0
    KERNEL1x4_L2 64,16,7,0
    KERNEL1x4_L2 64,16,8,0
    KERNEL1x4_L2 64,16,9,0
    KERNEL1x4_L2 64,16,10,0
    KERNEL1x4_L2 64,16,11,0
    KERNEL1x4_L2 64,16,12,0
    KERNEL1x4_L2 64,16,13,0
    KERNEL1x4_L2 64,16,14,0
    KERNEL1x4_L2 64,16,15,1
    bdnz    CGEMM_L1x4_LOOP
    MY_ALIGN
CGEMM_L1x4_LOOP_END:
/*----------------------------------------*/
    END1x4_2                        /* flush the last software-pipelined pair */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x4_L16_SUB:
/*----------------------------------------*/
    /* Process 16 K iterations for the 1x4 tile (8 unrolled-by-2 calls). */
    LOAD1x4_2
    KERNEL1x4_L2 64,16,0,0
    KERNEL1x4_L2 64,16,1,0
    KERNEL1x4_L2 64,16,2,0
    KERNEL1x4_L2 64,16,3,0
    KERNEL1x4_L2 64,16,4,0
    KERNEL1x4_L2 64,16,5,0
    KERNEL1x4_L2 64,16,6,0
    KERNEL1x4_E2 64,16,7,1          /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x4_L8_SUB:
/*----------------------------------------*/
    /* Process 8 K iterations for the 1x4 tile (4 unrolled-by-2 calls). */
    LOAD1x4_2
    KERNEL1x4_L2 64,16,0,0
    KERNEL1x4_L2 64,16,1,0
    KERNEL1x4_L2 64,16,2,0
    KERNEL1x4_E2 64,16,3,1          /* last pair: also advances pointers */
    blr
|
|
|
|
|
|
CGEMM_1x2_LMAIN_SUB:
/*----------------------------------------*/
    /* Main K loop for the 1x2 tile: 32 K iterations per CTR decrement. */
    mtctr   T8
    LOAD1x2_2
    MY_ALIGN
CGEMM_L1x2_LOOP:
/*----------------------------------------*/
    KERNEL1x2_L2 32,16,0,0
CGEMM_L1x2_K32:
    /* Secondary entry used by the K==32/33 fast paths in CGEMM_L1x2_SUB0. */
/*----------------------------------------*/
    KERNEL1x2_L2 32,16,1,0
    KERNEL1x2_L2 32,16,2,0
    KERNEL1x2_L2 32,16,3,0
    KERNEL1x2_L2 32,16,4,0
    KERNEL1x2_L2 32,16,5,0
    KERNEL1x2_L2 32,16,6,0
    KERNEL1x2_L2 32,16,7,0
    KERNEL1x2_L2 32,16,8,0
    KERNEL1x2_L2 32,16,9,0
    KERNEL1x2_L2 32,16,10,0
    KERNEL1x2_L2 32,16,11,0
    KERNEL1x2_L2 32,16,12,0
    KERNEL1x2_L2 32,16,13,0
    KERNEL1x2_L2 32,16,14,0
    KERNEL1x2_L2 32,16,15,1
    bdnz    CGEMM_L1x2_LOOP
    MY_ALIGN

CGEMM_L1x2_LOOP_END:
/*----------------------------------------*/
    END1x2_2                        /* flush the last software-pipelined pair */
    blr
|
|
MY_ALIGN
|
|
CGEMM_1x2_L16_SUB:
/*----------------------------------------*/
    /* Process 16 K iterations for the 1x2 tile (8 unrolled-by-2 calls). */
    LOAD1x2_2
    KERNEL1x2_L2 32,16,0,0
    KERNEL1x2_L2 32,16,1,0
    KERNEL1x2_L2 32,16,2,0
    KERNEL1x2_L2 32,16,3,0
    KERNEL1x2_L2 32,16,4,0
    KERNEL1x2_L2 32,16,5,0
    KERNEL1x2_L2 32,16,6,0
    KERNEL1x2_E2 32,16,7,1          /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
CGEMM_1x2_L8_SUB:
/*----------------------------------------*/
    /* Process 8 K iterations for the 1x2 tile (4 unrolled-by-2 calls). */
    LOAD1x2_2
    KERNEL1x2_L2 32,16,0,0
    KERNEL1x2_L2 32,16,1,0
    KERNEL1x2_L2 32,16,2,0
    KERNEL1x2_E2 32,16,3,1          /* last pair: also advances pointers */
    blr
|
|
|
|
|
|
CGEMM_1x1_LMAIN_SUB:
/*----------------------------------------*/
    /* Main K loop for the 1x1 tile: 32 K iterations per CTR decrement. */
    mtctr   T8
    LOAD1x1_2
    MY_ALIGN
CGEMM_L1x1_LOOP:
/*----------------------------------------*/
    KERNEL1x1_L2 16,16,0,0
CGEMM_L1x1_K32:
    /* Secondary entry used by the K==32/33 fast paths in CGEMM_L1x1_SUB0. */
/*----------------------------------------*/
    KERNEL1x1_L2 16,16,1,0
    KERNEL1x1_L2 16,16,2,0
    KERNEL1x1_L2 16,16,3,0
    KERNEL1x1_L2 16,16,4,0
    KERNEL1x1_L2 16,16,5,0
    KERNEL1x1_L2 16,16,6,0
    KERNEL1x1_L2 16,16,7,0
    KERNEL1x1_L2 16,16,8,0
    KERNEL1x1_L2 16,16,9,0
    KERNEL1x1_L2 16,16,10,0
    KERNEL1x1_L2 16,16,11,0
    KERNEL1x1_L2 16,16,12,0
    KERNEL1x1_L2 16,16,13,0
    KERNEL1x1_L2 16,16,14,0
    KERNEL1x1_L2 16,16,15,1
    bdnz    CGEMM_L1x1_LOOP
    MY_ALIGN
CGEMM_L1x1_LOOP_END:
/*----------------------------------------*/
    END1x1_2                        /* flush the last software-pipelined pair */
    blr
|
|
|
|
MY_ALIGN
|
|
CGEMM_1x1_L16_SUB:
/*----------------------------------------*/
    /* Process 16 K iterations for the 1x1 tile (8 unrolled-by-2 calls). */
    LOAD1x1_2
    KERNEL1x1_L2 16,16,0,0
    KERNEL1x1_L2 16,16,1,0
    KERNEL1x1_L2 16,16,2,0
    KERNEL1x1_L2 16,16,3,0
    KERNEL1x1_L2 16,16,4,0
    KERNEL1x1_L2 16,16,5,0
    KERNEL1x1_L2 16,16,6,0
    KERNEL1x1_E2 16,16,7,1          /* last pair: also advances pointers */
    blr
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_1x1_L8_SUB:
/*----------------------------------------*/
    /* Process 8 K iterations for the 1x1 tile (4 unrolled-by-2 calls). */
    LOAD1x1_2
    KERNEL1x1_L2 16,16,0,0
    KERNEL1x1_L2 16,16,1,0
    KERNEL1x1_L2 16,16,2,0
    KERNEL1x1_E2 16,16,3,1          /* last pair: also advances pointers */
    blr
|
|
|
|
|
|
|
|
/* MAIN LOOP BEGINS */
|
|
MY_ALIGN
|
|
|
|
|
|
CGEMM_L1:
/*----------------------------------------*/
    /* N&1 pass: compute the final single column of C. */
    andi.   J, N, 1
    ble     CGEMM_L1_END

CGEMM_L1_BEGIN:
/*----------------------------------------*/
    mr      CO, C                   /* CO -> start of this column       */
    add     T2, C, LDC              /* T2 used only for prefetch below  */
    mr      AO, A                   /* restart A for this column        */
    add     C, C, T1                /* NOTE(review): adds T1 (left over from
                                       the N=2 pass) rather than LDC; C is not
                                       read again after this final column, so
                                       presumably harmless -- verify. */
#if defined(TRMMKERNEL) && defined(LEFT)
    mr      TEMP_REG, OFFSET        /* off = offset; */
#endif
    srawi.  I, M, 3                 /* I = number of 8-row tiles        */
    ble     CGEMM_L1x8_END
    dcbt    CO, r0                  /* just prefetch */
    dcbt    T2, r0

CGEMM_L1x8_BEGIN:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
#else
    mr      BO, B
    dcbt    B, r0
#endif
    dcbt    AO, r0
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
    mr      T1, T6
    /* temps for prefetch */
    li      T2, 1024
    li      T3, 1024+512
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    /* temps for prefetch */
    li      T4, 2048
    li      T5, 2048+512
    srawi.  T8, T1, 7               /* T8 = (T1-2)/128 main-loop trips */
#else
    mr      T1, K
    /* temps for prefetch */
    li      T2, 1024
    li      T3, 1024+512
    addi    T1, T1, -2
    /* temps for prefetch */
    li      T4, 2048
    li      T5, 2048+512
    srawi.  T8, T1, 7               /* T8 = (K-2)/128 main-loop trips */
#endif
    ZERO1x8                         /* clear the 1x8 accumulators */
    ble     CGEMM_L1x8_SUB0
    bl      CGEMM_L1x8_LMAIN_SUB
    andi.   L, T1, 127              /* leftover K after 128-blocks */
    ble     CGEMM_L1x8_SAVE
    b       CGEMM_L1x8_SUB2

CGEMM_L1x8_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==128 and K==129. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 255
    cmpwi   T6, 129
#else
    andi.   L, K, 255
    cmpwi   K, 129
#endif
    li      T8, 1
    bne     CMP1x8_128K
    addi    BO, BO, -8              /* pre-bias for the load macro offsets */
    addi    AO, AO, -64
    LOAD1x8O 64,8
    END1x8_WITHOUT_ADD
    LOAD1x8_2O 128, 16
    mtctr   T8
    bl      CGEMM_L1x8_K128
    b       CGEMM_L1x8_SAVE
CMP1x8_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 128
#else
    cmpwi   K, 128
#endif
    bne     CGEMM_L1x8_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -16
    addi    AO, AO, -128
    LOAD1x8_2O 128,16
    bl      CGEMM_L1x8_K128
    b       CGEMM_L1x8_SAVE
    MY_ALIGN

CGEMM_L1x8_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 64/32/16/8/4/2/1. */
    andi.   T1, L, 64
    ble     CGEMM_L1x8_SUB2_32
    bl      CGEMM_1x8_L64_SUB

CGEMM_L1x8_SUB2_32:
/*----------------------------------------*/
    andi.   T1, L, 32
    ble     CGEMM_L1x8_SUB2_16
    bl      CGEMM_1x8_L32_SUB

CGEMM_L1x8_SUB2_16:
/*----------------------------------------*/
    andi.   T1, L, 16
    ble     CGEMM_L1x8_SUB2_8
    bl      CGEMM_1x8_L16_SUB

CGEMM_L1x8_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L1x8_SUB2_4
    LOAD1x8_2
    KERNEL1x8_L2 128,16,0,0
    KERNEL1x8_L2 128,16,1,0
    KERNEL1x8_L2 128,16,2,0
    KERNEL1x8_E2 128,16,3,1

CGEMM_L1x8_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L1x8_SUB2_2
    LOAD1x8_2
    KERNEL1x8_L2 128,16,0,0
    KERNEL1x8_E2 128,16,1,1

CGEMM_L1x8_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L1x8_SUB2_1
    LOAD1x8_2
    KERNEL1x8_E2 128,16,0,1

CGEMM_L1x8_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L1x8_SAVE
    KERNEL1x8                       /* single K iteration */
    MY_ALIGN

CGEMM_L1x8_SAVE:
/*----------------------------------------*/
    addic.  I, I, -1
    MY_ALIGN
    SAVE1x8                         /* scale by alpha and store the 1x8 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
#endif
    bgt     CGEMM_L1x8_BEGIN        /* next 8-row tile */
    /* Row remainder dispatch (M&7). */
    andi.   T2, M, 7
    ble     CGEMM_L1x1_END
    andi.   T1, M, 4
    ble     CGEMM_L1x4_END
    b       CGEMM_L1x4_BEGIN
    MY_ALIGN

CGEMM_L1x8_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L1x4_BEGIN:
/*----------------------------------------*/
    /* 4-row x 1-column remainder tile (taken when M&4). */
    andi.   T2, M, 7
    ble     CGEMM_L1x1_END
    andi.   T1, M, 4
    ble     CGEMM_L1x4_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO1x4
    ble     CGEMM_L1x4_SUB0
    bl      CGEMM_1x4_LMAIN_SUB
    andi.   L, T1, 31
    ble     CGEMM_L1x4_SAVE
    b       CGEMM_L1x4_SUB2

CGEMM_L1x4_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP1x4_32K
    addi    BO, BO, -8              /* pre-bias for the load macro offsets */
    addi    AO, AO, -32
    LOAD1x4O 32,8
    END1x4_WITHOUT_ADD
    LOAD1x4_2O 64, 16
    mtctr   T8
    bl      CGEMM_L1x4_K32
    b       CGEMM_L1x4_SAVE
CMP1x4_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L1x4_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -16
    addi    AO, AO, -64
    LOAD1x4_2O 64,16
    bl      CGEMM_L1x4_K32
    b       CGEMM_L1x4_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L1x4_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L1x4_SUB2_8
    bl      CGEMM_1x4_L16_SUB

CGEMM_L1x4_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L1x4_SUB2_4
    bl      CGEMM_1x4_L8_SUB

CGEMM_L1x4_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L1x4_SUB2_2
    LOAD1x4_2
    KERNEL1x4_L2 64,16,0,0
    KERNEL1x4_E2 64,16,1,1

CGEMM_L1x4_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L1x4_SUB2_1
    LOAD1x4_2
    KERNEL1x4_E2 64,16,0,1

CGEMM_L1x4_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L1x4_SAVE
    KERNEL1x4                       /* single K iteration */

CGEMM_L1x4_SAVE:
/*----------------------------------------*/
    SAVE1x4                         /* scale by alpha and store the 1x4 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
#endif

CGEMM_L1x4_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L1x2_BEGIN:
/*----------------------------------------*/
    /* 2-row x 1-column remainder tile (taken when M&2). */
    andi.   T1, M, 2
    ble     CGEMM_L1x2_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO1x2
    ble     CGEMM_L1x2_SUB0
    bl      CGEMM_1x2_LMAIN_SUB
    andi.   L, T1, 31
    ble     CGEMM_L1x2_SAVE
    b       CGEMM_L1x2_SUB2

CGEMM_L1x2_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP1x2_32K
    addi    BO, BO, -8              /* pre-bias for the load macro offsets */
    addi    AO, AO, -16
    LOAD1x2O 16,8
    END1x2_WITHOUT_ADD
    LOAD1x2_2O 32, 16
    mtctr   T8
    bl      CGEMM_L1x2_K32
    b       CGEMM_L1x2_SAVE
CMP1x2_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L1x2_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -16
    addi    AO, AO, -32
    LOAD1x2_2O 32,16
    bl      CGEMM_L1x2_K32
    b       CGEMM_L1x2_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L1x2_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L1x2_SUB2_8
    bl      CGEMM_1x2_L16_SUB

CGEMM_L1x2_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L1x2_SUB2_4
    bl      CGEMM_1x2_L8_SUB

CGEMM_L1x2_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L1x2_SUB2_2
    LOAD1x2_2
    KERNEL1x2_L2 32,16,0,0
    KERNEL1x2_E2 32,16,1,1

CGEMM_L1x2_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L1x2_SUB2_1
    LOAD1x2_2
    KERNEL1x2_E2 32,16,0,1

CGEMM_L1x2_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L1x2_SAVE
    KERNEL1x2                       /* single K iteration */
    MY_ALIGN

CGEMM_L1x2_SAVE:
/*----------------------------------------*/
    SAVE1x2                         /* scale by alpha and store the 1x2 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
#endif

CGEMM_L1x2_END:
/*----------------------------------------*/
|
|
|
|
|
|
CGEMM_L1x1_BEGIN:
/*----------------------------------------*/
    /* 1-row x 1-column remainder tile (taken when M&1), then the N=1
       pass epilogue that advances B and the TRMM offset. */
    andi.   T1, M, 1
    ble     CGEMM_L1x1_END
#if defined(TRMMKERNEL)
    REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
#else
    mr      BO, B
#endif
#if defined(TRMMKERNEL)
    REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
    mr      T1, T6
    addi    T1, T1, -2              /* two iterations live in the pipeline */
    srawi.  T8, T1, 5               /* T8 = (T1-2)/32 main-loop trips */
#else
    mr      T1, K
    addi    T1, T1, -2
    srawi.  T8, T1, 5               /* T8 = (K-2)/32 main-loop trips */
#endif
    ZERO1x1
    ble     CGEMM_L1x1_SUB0
    bl      CGEMM_1x1_LMAIN_SUB
    andi.   L, T1, 31
    ble     CGEMM_L1x1_SAVE
    b       CGEMM_L1x1_SUB2

CGEMM_L1x1_SUB0:
/*----------------------------------------*/
    /* Fast paths for K==32 and K==33. */
#if defined(TRMMKERNEL)
    andi.   L, T6, 63
    cmpwi   T6, 33
#else
    andi.   L, K, 63
    cmpwi   K, 33
#endif
    li      T8, 1
    bne     CMP1x1_32K
    addi    BO, BO, -8              /* pre-bias for the load macro offsets */
    addi    AO, AO, -8
    LOAD1x1O 8,8
    END1x1_WITHOUT_ADD
    LOAD1x1_2O 16, 16
    mtctr   T8
    bl      CGEMM_L1x1_K32
    b       CGEMM_L1x1_SAVE
CMP1x1_32K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
    cmpwi   T6, 32
#else
    cmpwi   K, 32
#endif
    bne     CGEMM_L1x1_SUB2
    MY_ALIGN
    mtctr   T8
    addi    BO, BO, -16
    addi    AO, AO, -16
    LOAD1x1_2O 16,16
    bl      CGEMM_L1x1_K32
    b       CGEMM_L1x1_SAVE
    MY_ALIGN
    MY_ALIGN

CGEMM_L1x1_SUB2:
/*----------------------------------------*/
    /* Binary-decomposed K remainder: 16/8/4/2/1. */
    andi.   T1, L, 16
    ble     CGEMM_L1x1_SUB2_8
    bl      CGEMM_1x1_L16_SUB

CGEMM_L1x1_SUB2_8:
/*----------------------------------------*/
    andi.   T1, L, 8
    ble     CGEMM_L1x1_SUB2_4
    bl      CGEMM_1x1_L8_SUB

CGEMM_L1x1_SUB2_4:
/*----------------------------------------*/
    andi.   T1, L, 4
    ble     CGEMM_L1x1_SUB2_2
    LOAD1x1_2
    KERNEL1x1_L2 16,16,0,0
    KERNEL1x1_E2 16,16,1,1

CGEMM_L1x1_SUB2_2:
/*----------------------------------------*/
    andi.   T1, L, 2
    ble     CGEMM_L1x1_SUB2_1
    LOAD1x1_2
    KERNEL1x1_E2 16,16,0,1

CGEMM_L1x1_SUB2_1:
/*----------------------------------------*/
    andi.   T1, L, 1
    ble     CGEMM_L1x1_SAVE
    KERNEL1x1                       /* single K iteration */
    MY_ALIGN

CGEMM_L1x1_SAVE:
/*----------------------------------------*/
    SAVE1x1                         /* scale by alpha and store the 1x1 tile */
#if defined(TRMMKERNEL)
    REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
#endif

CGEMM_L1x1_END:
/*----------------------------------------*/
    slwi    T1, K, 3                /* B += K * 8 bytes (1 col of 8-byte complex) */
    add     B, B, T1
#if defined(TRMMKERNEL) && !defined(LEFT)
    addi    TEMP_REG, TEMP_REG, 1   /* off += 1 column consumed */
#endif

CGEMM_L1_END:
|
|
|
|
|
|
|
|
|