Optimizations for APM's xgene-1 (aarch64). 1) general system updates to support armv8 better. "make all" did not work; one needed to supply TARGET=ARMV8. 2) sgemm 4x4 kernel in assembler using SIMD, and configuration changes to use it. 3) strmm 4x4 kernel in C. Since the sgemm kernel does 4x4, the trmm kernel must also do 4xN. Added Dave Nuechterlein to the contributors list.
1328 lines
23 KiB
ArmAsm
1328 lines
23 KiB
ArmAsm
/***************************************************************************
|
|
Copyright (c) 2013, The OpenBLAS Project
|
|
All rights reserved.
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
3. Neither the name of the OpenBLAS project nor the names of
|
|
its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*****************************************************************************/
|
|
|
|
/**************************************************************************************
|
|
* 2013/11/23 Saar
|
|
* BLASTEST : OK
|
|
* CTEST : OK
|
|
* TEST : OK
|
|
*
|
|
*
|
|
* 2013/11/02 Saar
|
|
* UNROLL_N 4
|
|
* UNROLL_M 4
|
|
* DGEMM_P 128
|
|
* DGEMM_Q 240
|
|
* DGEMM_R 12288
|
|
* A_PRE 128
|
|
* B_PRE 128
|
|
* C_PRE 32
|
|
*
|
|
* Performance on Odroid U2:
|
|
*
|
|
* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
|
|
* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
|
|
* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
|
|
* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
|
|
**************************************************************************************/
|
|
|
|
#define ASSEMBLER
|
|
#include "common.h"
|
|
|
|
/* Argument registers:  x0           x1           x2           s0           x3         x4         x5        x6 */
/* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc) */
|
|
|
|
|
|
/* General-purpose register aliases used throughout this kernel.          */
/* Only block comments are used on these lines so that nothing but the    */
/* register name is substituted where an alias appears in an instruction. */
#define origM       x0      /* M as received from the caller            */
#define origN       x1      /* N as received from the caller            */
#define origK       x2      /* K as received from the caller            */
#define origPA      x3      /* start of the packed A buffer             */
#define origPB      x4      /* start of the current packed B panel      */
#define pC          x5      /* first C column of the current panel      */
#define LDC         x6      /* ldc; converted to bytes at kernel entry  */
#define offset      x7      /* declared but not referenced in this file */
#define counterL    x8      /* loop counter along K                     */
#define counterI    x9      /* loop counter along M                     */
#define pB          x10     /* running pointer into packed B            */
#define counterJ    x11     /* loop counter along N                     */
#define tempALPHA   x12     /* raw bits of alpha copied out of v0       */
#define pCRow0      x13     /* C pointer, first column of the tile      */
#define pCRow1      x14     /* C pointer, scratch for further columns   */
#define pCRow2      x15     /* C pointer, scratch for further columns   */
#define pA          x16     /* running pointer into packed A            */
|
|
|
|
// 00 origM
|
|
// 01 origN
|
|
// 02 origK
|
|
// 03 origPA
|
|
// 04 origPB
|
|
// 05 pC
|
|
// 06 origLDC -> LDC
|
|
// 07 offset
|
|
// 08 counterL
|
|
// 09 counterI
|
|
// 10 pB
|
|
// 11 counterJ
|
|
// 12 tempALPHA
|
|
// 13 pCRow0
|
|
// 14 pCRow1
|
|
// 15 pCRow2
|
|
// 16 pA
|
|
// 17
|
|
// 18 must save
|
|
// 19 must save
|
|
// 20 must save
|
|
// 21 must save
|
|
// 22 must save
|
|
// 23 must save
|
|
// 24 must save
|
|
// 25 must save
|
|
// 26 must save
|
|
// 27 must save
|
|
// 28 must save
|
|
// 29 frame
|
|
// 30 link
|
|
// 31 sp
|
|
|
|
//v00 orig ALPHA -> a00
|
|
//v01 a01
|
|
//v02 a02
|
|
//v03 a03
|
|
//v04 a10
|
|
//v05 a11
|
|
//v06 a12
|
|
//v07 a13
|
|
//v08 must save b00
|
|
//v09 must save b01
|
|
//v10 must save b02
|
|
//v11 must save b03
|
|
//v12 must save b10
|
|
//v13 must save b11
|
|
//v14 must save b12
|
|
//v15 must save b13
|
|
//v16 C00 (caller-saved under AAPCS64; the prologue still spills d16)
//v17 C01 (caller-saved under AAPCS64; the prologue still spills d17)
|
|
//v18 C02
|
|
//v19 C03
|
|
//v20 C10
|
|
//v21 C11
|
|
//v22 C12
|
|
//v23 C13
|
|
//v24 C20
|
|
//v25 C21
|
|
//v26 C22
|
|
//v27 C23
|
|
//v28 C30
|
|
//v29 C31
|
|
//v30 C32
|
|
//v31 C33
|
|
|
|
// add sp,sp,#-(6*16)
|
|
// stp x18,x19,[sp,#(0*16)]
|
|
// stp x20,x21,[sp,#(1*16)]
|
|
|
|
|
|
/**************************************************************************************
|
|
* Macro definitions
|
|
**************************************************************************************/
|
|
|
|
/*
 * INIT4x4: clear the four 4-lane accumulators of the 4x4 tile
 * (v16, v20, v24, v28).
 *
 * The registers are loaded with an immediate zero instead of being
 * subtracted from themselves: x - x is NaN whenever the stale register
 * content is NaN or +/-Inf, which would leak into the new tile.
 */
.macro INIT4x4
    movi    v16.4s, #0
    movi    v20.4s, #0
    movi    v24.4s, #0
    movi    v28.4s, #0
.endm
|
|
|
|
/*
 * KERNEL4x4_I: first k-step of a pipelined 4x4 tile.
 *
 * Loads four B values (v8.2s, v10.2s) and four A values (v0.4s), then
 * initialises the accumulators v16/v20/v24/v28 with the products, so no
 * separate INIT is needed on this path.  Afterwards it preloads the
 * operands of the following k-step into v12, v14 and v4, which are the
 * registers consumed by KERNEL4x4_M2 and KERNEL4x4_E.
 * pA advances by 32 bytes and pB by 32 bytes in total.
 *
 * A plain fmul is used rather than fmulx: fmulx turns 0 * Inf into +/-2.0,
 * whereas the fmla steps of the remaining k iterations produce NaN for the
 * same operands.  Element operands use the .s[i] form like the other macros.
 */
.macro KERNEL4x4_I
    ld1     {v8.2s}, [pB], #8
    ld1     {v10.2s}, [pB], #8
    ld1     {v0.4s}, [pA], #16

    fmul    v16.4s, v0.4s, v8.s[0]
    fmul    v20.4s, v0.4s, v8.s[1]
    fmul    v24.4s, v0.4s, v10.s[0]
    fmul    v28.4s, v0.4s, v10.s[1]

    ld1     {v12.2s}, [pB], #8          // operands for the next k-step
    ld1     {v14.2s}, [pB], #8
    ld1     {v4.4s}, [pA], #16
.endm
|
|
|
|
|
|
/*
 * KERNEL4x4_M2: pipelined k-step working on the second register set.
 * Accumulates A (v4) times B (v12, v14) into v16/v20/v24/v28, then
 * fetches the next operands into the first register set (v0, v8, v10).
 * pA and pB each advance by 16 bytes.
 */
.macro KERNEL4x4_M2
    fmla    v16.4s, v4.4s, v12.s[0]
    fmla    v20.4s, v4.4s, v12.s[1]
    fmla    v24.4s, v4.4s, v14.s[0]
    fmla    v28.4s, v4.4s, v14.s[1]

    ld1     {v0.4s}, [pA], #16          // A for the next step
    ld1     {v8.2s}, [pB], #8           // B[0..1] for the next step
    ld1     {v10.2s}, [pB], #8          // B[2..3] for the next step
.endm
|
|
|
|
|
|
/*
 * KERNEL4x4_M1: pipelined k-step working on the first register set.
 * Accumulates A (v0) times B (v8, v10) into v16/v20/v24/v28, then
 * fetches the next operands into the second register set (v4, v12, v14).
 * pA and pB each advance by 16 bytes.
 */
.macro KERNEL4x4_M1
    fmla    v16.4s, v0.4s, v8.s[0]
    fmla    v20.4s, v0.4s, v8.s[1]
    fmla    v24.4s, v0.4s, v10.s[0]
    fmla    v28.4s, v0.4s, v10.s[1]

    ld1     {v4.4s}, [pA], #16          // A for the next step
    ld1     {v12.2s}, [pB], #8          // B[0..1] for the next step
    ld1     {v14.2s}, [pB], #8          // B[2..3] for the next step
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL4x4_E: last step of the pipelined sequence.
 * Consumes the operands already held in v4, v12 and v14 and performs
 * no further loads, so pA and pB are left untouched.
 */
.macro KERNEL4x4_E
    fmla    v16.4s, v4.4s, v12.s[0]
    fmla    v20.4s, v4.4s, v12.s[1]
    fmla    v24.4s, v4.4s, v14.s[0]
    fmla    v28.4s, v4.4s, v14.s[1]
.endm
|
|
|
|
|
|
|
|
|
|
/*
 * KERNEL4x4_SUB: one self-contained (non-pipelined) k-step of the 4x4 tile.
 * Loads four A values and four B values, accumulates the sixteen products
 * into v16/v20/v24/v28 and advances pA and pB by 16 bytes each.
 */
.macro KERNEL4x4_SUB
    ld1     {v0.4s}, [pA], #16
    ld1     {v8.2s}, [pB], #8
    ld1     {v10.2s}, [pB], #8

    fmla    v16.4s, v0.4s, v8.s[0]
    fmla    v20.4s, v0.4s, v8.s[1]
    fmla    v24.4s, v0.4s, v10.s[0]
    fmla    v28.4s, v0.4s, v10.s[1]
.endm
|
|
|
|
|
|
|
|
|
|
/*
 * SAVE4x4: C += alpha * accumulators for a 4x4 tile.
 *
 * Each accumulator register covers four consecutive floats at one of four
 * addresses spaced LDC bytes apart, starting at pCRow0.  alpha is brought
 * back from tempALPHA into lane 0 of v0.
 * On exit pCRow0 has advanced by 16 bytes; pCRow1 and pCRow2 are scratch.
 */
.macro SAVE4x4
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC         // address of the second group
    add     pCRow2, pCRow1, LDC         // address of the third group

    ld1     {v8.4s}, [pCRow0]
    fmla    v8.4s, v16.4s, v0.s[0]
    st1     {v8.4s}, [pCRow0], #16      // store and step to the next tile

    ld1     {v12.4s}, [pCRow1]
    fmla    v12.4s, v20.4s, v0.s[0]
    st1     {v12.4s}, [pCRow1]

    ld1     {v8.4s}, [pCRow2]
    fmla    v8.4s, v24.4s, v0.s[0]
    st1     {v8.4s}, [pCRow2]

    add     pCRow1, pCRow2, LDC         // reuse pCRow1 for the fourth group

    ld1     {v12.4s}, [pCRow1]
    fmla    v12.4s, v28.4s, v0.s[0]
    st1     {v12.4s}, [pCRow1]
.endm
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT2x4: clear the eight scalar accumulators of the 2x4 tile
 * (s16, s17, s20, s21, s24, s25, s28, s29).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT2x4
    fmov    s16, wzr
    fmov    s17, wzr
    fmov    s20, wzr
    fmov    s21, wzr
    fmov    s24, wzr
    fmov    s25, wzr
    fmov    s28, wzr
    fmov    s29, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL2x4_SUB: one k-step of the 2x4 tile.
 * Reads two floats from pA (s0, s1) and four from pB (s8..s11), adds the
 * eight products to s16/s17, s20/s21, s24/s25, s28/s29 and leaves pA
 * advanced by 8 bytes and pB by 16 bytes.
 */
.macro KERNEL2x4_SUB
    ldp     s8, s9, [pB], #8
    ldp     s10, s11, [pB], #8
    ldp     s0, s1, [pA], #8

    fmadd   s16, s0, s8, s16
    fmadd   s17, s1, s8, s17

    fmadd   s20, s0, s9, s20
    fmadd   s21, s1, s9, s21

    fmadd   s24, s0, s10, s24
    fmadd   s25, s1, s10, s25

    fmadd   s28, s0, s11, s28
    fmadd   s29, s1, s11, s29
.endm
|
|
|
|
/* F1ST(acc, x, y): acc = acc + x * y, as a single fused multiply-add.  */
#define F1ST( acc, x, y) fmadd acc, x, y, acc
/* L1ST(dst, base, off): load one scalar from the address base + off.   */
#define L1ST( dst, base, off) ldr dst, [base, off]
|
|
|
|
/*
 * SAVE2x4: C += alpha * accumulators for a 2x4 tile.
 * Two floats are updated at each of four addresses spaced LDC bytes apart,
 * starting at pCRow0.  On exit pCRow0 has advanced by 8 bytes; pCRow1 and
 * pCRow2 are scratch.
 */
.macro SAVE2x4
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC
    add     pCRow2, pCRow1, LDC

    // first group
    ldr     s8, [pCRow0]
    ldr     s9, [pCRow0, #4]
    fmadd   s8, s0, s16, s8
    fmadd   s9, s0, s17, s9
    str     s8, [pCRow0]
    str     s9, [pCRow0, #4]

    // second group
    ldr     s12, [pCRow1]
    ldr     s13, [pCRow1, #4]
    fmadd   s12, s0, s20, s12
    fmadd   s13, s0, s21, s13
    str     s12, [pCRow1]
    str     s13, [pCRow1, #4]

    // third group
    ldr     s8, [pCRow2]
    ldr     s9, [pCRow2, #4]
    fmadd   s8, s0, s24, s8
    fmadd   s9, s0, s25, s9
    str     s8, [pCRow2]
    str     s9, [pCRow2, #4]

    // fourth group, addressed through a recycled pCRow1
    add     pCRow1, pCRow2, LDC
    ldr     s12, [pCRow1]
    ldr     s13, [pCRow1, #4]
    fmadd   s12, s0, s28, s12
    fmadd   s13, s0, s29, s13
    str     s12, [pCRow1]
    str     s13, [pCRow1, #4]

    add     pCRow0, pCRow0, #8          // step to the next tile
.endm
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT1x4: clear the four scalar accumulators of the 1x4 tile
 * (s16, s20, s24, s28).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT1x4
    fmov    s16, wzr
    fmov    s20, wzr
    fmov    s24, wzr
    fmov    s28, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL1x4_SUB: one k-step of the 1x4 tile.
 * Reads one float from pA (s0) and four from pB (s8..s11), adds the four
 * products to s16, s20, s24, s28 and leaves pA advanced by 4 bytes and
 * pB by 16 bytes.
 */
.macro KERNEL1x4_SUB
    ldp     s8, s9, [pB], #8
    ldp     s10, s11, [pB], #8
    ldr     s0, [pA], #4

    fmadd   s16, s0, s8, s16
    fmadd   s20, s0, s9, s20
    fmadd   s24, s0, s10, s24
    fmadd   s28, s0, s11, s28
.endm
|
|
|
|
/*
 * SAVE1x4: C += alpha * accumulators for a 1x4 tile.
 * One float is updated at each of four addresses spaced LDC bytes apart,
 * starting at pCRow0.  On exit pCRow0 has advanced by 4 bytes; pCRow1 and
 * pCRow2 are scratch.
 */
.macro SAVE1x4
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC
    add     pCRow2, pCRow1, LDC

    ldr     s8, [pCRow0]
    fmadd   s8, s0, s16, s8
    str     s8, [pCRow0]

    ldr     s12, [pCRow1]
    fmadd   s12, s0, s20, s12
    str     s12, [pCRow1]

    ldr     s8, [pCRow2]
    fmadd   s8, s0, s24, s8
    str     s8, [pCRow2]

    add     pCRow1, pCRow2, LDC         // recycle pCRow1 for the fourth address

    ldr     s12, [pCRow1]
    fmadd   s12, s0, s28, s12
    str     s12, [pCRow1]

    add     pCRow0, pCRow0, #4          // step to the next tile
.endm
|
|
|
|
/******************************************************************************/
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT4x2: clear the eight scalar accumulators of the 4x2 tile
 * (s16..s19 and s20..s23).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT4x2
    fmov    s16, wzr
    fmov    s17, wzr
    fmov    s18, wzr
    fmov    s19, wzr
    fmov    s20, wzr
    fmov    s21, wzr
    fmov    s22, wzr
    fmov    s23, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL4x2_SUB: one k-step of the 4x2 tile.
 * Reads four floats from pA (s0..s3) and two from pB (s8, s9), adds the
 * eight products to s16..s19 and s20..s23 and leaves pA advanced by
 * 16 bytes and pB by 8 bytes.
 */
.macro KERNEL4x2_SUB
    ldp     s8, s9, [pB], #8
    ldp     s0, s1, [pA], #8
    ldp     s2, s3, [pA], #8

    fmadd   s16, s0, s8, s16
    fmadd   s17, s1, s8, s17
    fmadd   s18, s2, s8, s18
    fmadd   s19, s3, s8, s19

    fmadd   s20, s0, s9, s20
    fmadd   s21, s1, s9, s21
    fmadd   s22, s2, s9, s22
    fmadd   s23, s3, s9, s23
.endm
|
|
|
|
/*
 * SAVE4x2: C += alpha * accumulators for a 4x2 tile.
 * Four floats are updated at pCRow0 and four at pCRow0 + LDC.
 * On exit pCRow0 has advanced by 16 bytes; pCRow1 is scratch.
 */
.macro SAVE4x2
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC

    // first group
    ldr     s8, [pCRow0]
    ldr     s9, [pCRow0, #4]
    ldr     s10, [pCRow0, #8]
    ldr     s11, [pCRow0, #12]

    fmadd   s8, s0, s16, s8
    fmadd   s9, s0, s17, s9
    fmadd   s10, s0, s18, s10
    fmadd   s11, s0, s19, s11

    str     s8, [pCRow0]
    str     s9, [pCRow0, #4]
    str     s10, [pCRow0, #8]
    str     s11, [pCRow0, #12]

    // second group
    ldr     s12, [pCRow1]
    ldr     s13, [pCRow1, #4]
    ldr     s14, [pCRow1, #8]
    ldr     s15, [pCRow1, #12]

    fmadd   s12, s0, s20, s12
    fmadd   s13, s0, s21, s13
    fmadd   s14, s0, s22, s14
    fmadd   s15, s0, s23, s15

    str     s12, [pCRow1]
    str     s13, [pCRow1, #4]
    str     s14, [pCRow1, #8]
    str     s15, [pCRow1, #12]

    add     pCRow0, pCRow0, #16         // step to the next tile
.endm
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT2x2: clear the four scalar accumulators of the 2x2 tile
 * (s16, s17, s20, s21).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT2x2
    fmov    s16, wzr
    fmov    s17, wzr
    fmov    s20, wzr
    fmov    s21, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL2x2_SUB: one k-step of the 2x2 tile.
 * Reads two floats from pA (s0, s1) and two from pB (s8, s9), adds the
 * four products to s16, s17, s20, s21 and leaves both pointers advanced
 * by 8 bytes.
 */
.macro KERNEL2x2_SUB
    ldp     s8, s9, [pB], #8
    ldp     s0, s1, [pA], #8

    fmadd   s16, s0, s8, s16
    fmadd   s17, s1, s8, s17

    fmadd   s20, s0, s9, s20
    fmadd   s21, s1, s9, s21
.endm
|
|
|
|
/*
 * SAVE2x2: C += alpha * accumulators for a 2x2 tile.
 * Two floats are updated at pCRow0 and two at pCRow0 + LDC.
 * On exit pCRow0 has advanced by 8 bytes; pCRow1 is scratch.
 */
.macro SAVE2x2
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC

    ldr     s8, [pCRow0]
    ldr     s9, [pCRow0, #4]
    fmadd   s8, s0, s16, s8
    fmadd   s9, s0, s17, s9
    str     s8, [pCRow0]
    str     s9, [pCRow0, #4]

    ldr     s12, [pCRow1]
    ldr     s13, [pCRow1, #4]
    fmadd   s12, s0, s20, s12
    fmadd   s13, s0, s21, s13
    str     s12, [pCRow1]
    str     s13, [pCRow1, #4]

    add     pCRow0, pCRow0, #8          // step to the next tile
.endm
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT1x2: clear the two scalar accumulators of the 1x2 tile (s16, s20).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT1x2
    fmov    s16, wzr
    fmov    s20, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL1x2_SUB: one k-step of the 1x2 tile.
 * Reads one float from pA (s0) and two from pB (s8, s9), adds the two
 * products to s16 and s20 and leaves pA advanced by 4 bytes and pB by
 * 8 bytes.
 */
.macro KERNEL1x2_SUB
    ldp     s8, s9, [pB], #8
    ldr     s0, [pA], #4

    fmadd   s16, s0, s8, s16
    fmadd   s20, s0, s9, s20
.endm
|
|
|
|
/*
 * SAVE1x2: C += alpha * accumulators for a 1x2 tile.
 * One float is updated at pCRow0 and one at pCRow0 + LDC.
 * On exit pCRow0 has advanced by 4 bytes; pCRow1 is scratch.
 */
.macro SAVE1x2
    mov     v0.d[0], tempALPHA          // s0 = alpha
    add     pCRow1, pCRow0, LDC

    ldr     s8, [pCRow0]
    fmadd   s8, s0, s16, s8
    str     s8, [pCRow0]

    ldr     s12, [pCRow1]
    fmadd   s12, s0, s20, s12
    str     s12, [pCRow1]

    add     pCRow0, pCRow0, #4          // step to the next tile
.endm
|
|
|
|
/******************************************************************************/
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT4x1: clear the four scalar accumulators of the 4x1 tile (s16..s19).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT4x1
    fmov    s16, wzr
    fmov    s17, wzr
    fmov    s18, wzr
    fmov    s19, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL4x1_SUB: one k-step of the 4x1 tile.
 * Reads four floats from pA (s0..s3) and one from pB (s8), adds the four
 * products to s16..s19 and leaves pA advanced by 16 bytes and pB by
 * 4 bytes.
 */
.macro KERNEL4x1_SUB
    ldr     s8, [pB], #4
    ldp     s0, s1, [pA], #8
    ldp     s2, s3, [pA], #8

    fmadd   s16, s0, s8, s16
    fmadd   s17, s1, s8, s17
    fmadd   s18, s2, s8, s18
    fmadd   s19, s3, s8, s19
.endm
|
|
|
|
/*
 * SAVE4x1: C += alpha * accumulators for a 4x1 tile.
 * Four consecutive floats at pCRow0 are updated; pCRow0 then advances
 * by 16 bytes.
 */
.macro SAVE4x1
    mov     v0.d[0], tempALPHA          // s0 = alpha

    ldr     s8, [pCRow0]
    ldr     s9, [pCRow0, #4]
    ldr     s10, [pCRow0, #8]
    ldr     s11, [pCRow0, #12]

    fmadd   s8, s0, s16, s8
    fmadd   s9, s0, s17, s9
    fmadd   s10, s0, s18, s10
    fmadd   s11, s0, s19, s11

    str     s8, [pCRow0]
    str     s9, [pCRow0, #4]
    str     s10, [pCRow0, #8]
    str     s11, [pCRow0, #12]

    add     pCRow0, pCRow0, #16         // step to the next tile
.endm
|
|
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT2x1: clear the two scalar accumulators of the 2x1 tile (s16, s17).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT2x1
    fmov    s16, wzr
    fmov    s17, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL2x1_SUB: one k-step of the 2x1 tile.
 * Reads two floats from pA (s0, s1) and one from pB (s8), adds the two
 * products to s16 and s17 and leaves pA advanced by 8 bytes and pB by
 * 4 bytes.
 */
.macro KERNEL2x1_SUB
    ldr     s8, [pB], #4
    ldp     s0, s1, [pA], #8

    fmadd   s16, s0, s8, s16
    fmadd   s17, s1, s8, s17
.endm
|
|
|
|
/*
 * SAVE2x1: C += alpha * accumulators for a 2x1 tile.
 * Two consecutive floats at pCRow0 are updated; pCRow0 then advances
 * by 8 bytes.
 */
.macro SAVE2x1
    mov     v0.d[0], tempALPHA          // s0 = alpha

    ldr     s8, [pCRow0]
    ldr     s9, [pCRow0, #4]

    fmadd   s8, s0, s16, s8
    fmadd   s9, s0, s17, s9

    str     s8, [pCRow0]
    str     s9, [pCRow0, #4]

    add     pCRow0, pCRow0, #8          // step to the next tile
.endm
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * INIT1x1: clear the single scalar accumulator of the 1x1 tile (s16).
 *
 * Zero is moved in from wzr rather than computed as x - x, because
 * x - x is NaN when the stale register content is NaN or +/-Inf.
 */
.macro INIT1x1
    fmov    s16, wzr
.endm
|
|
|
|
|
|
|
|
/*
 * KERNEL1x1_SUB: one k-step of the 1x1 tile.
 * Reads one float from pA (s0) and one from pB (s8), adds their product
 * to s16 and leaves both pointers advanced by 4 bytes.
 */
.macro KERNEL1x1_SUB
    ldr     s8, [pB], #4
    ldr     s0, [pA], #4

    fmadd   s16, s0, s8, s16
.endm
|
|
|
|
/*
 * SAVE1x1: C += alpha * accumulator for a 1x1 tile.
 * The float at pCRow0 is updated; pCRow0 then advances by 4 bytes.
 */
.macro SAVE1x1
    mov     v0.d[0], tempALPHA          // s0 = alpha

    ldr     s8, [pCRow0]
    fmadd   s8, s0, s16, s8
    str     s8, [pCRow0]

    add     pCRow0, pCRow0, #4          // step to the next tile
.endm
|
|
|
|
|
|
|
|
|
|
|
|
/**************************************************************************************
|
|
* End of macro definitions
|
|
**************************************************************************************/
|
|
|
|
/*
 * Kernel entry.
 *
 * Reserves an 80-byte frame and spills d8..d17 into it (the matching
 * reloads are at sgemm_kernel_L999).  NOTE(review): only d8..d15 are
 * callee-saved under AAPCS64; the d16/d17 slot looks unnecessary, but
 * the frame layout is shared with the exit code, so it is kept here.
 *
 * alpha arrives in v0; its raw bits are parked in tempALPHA because v0
 * is reused as an A operand by the compute macros.  LDC is converted
 * from elements to bytes (4 bytes per float).
 */
PROLOGUE

    .align 5
    sub     sp, sp, #(5*16)
    stp     d8, d9, [sp, #(0*16)]
    stp     d10, d11, [sp, #(1*16)]
    stp     d12, d13, [sp, #(2*16)]
    stp     d14, d15, [sp, #(3*16)]
    stp     d16, d17, [sp, #(4*16)]

    mov     tempALPHA, v0.d[0]          // keep alpha out of the way of v0
    lsl     LDC, LDC, #2                // ldc in bytes

    mov     pB, origPB

    asr     counterJ, origN, #2         // number of 4-wide panels along N
    cmp     counterJ, #0
    b.le    sgemm_kernel_L2_BEGIN       // none: go straight to the N tail
|
|
|
|
/*
 * Panel of four along N: set up the C and A pointers, then process all
 * full 4x4 tiles along M.
 *
 * The K loop is software pipelined: KERNEL4x4_I starts the sequence,
 * KERNEL4x4_M1 / KERNEL4x4_M2 alternate between the two operand register
 * sets, and KERNEL4x4_E drains it.  counterL counts pairs of k-steps; an
 * odd K is finished with a single KERNEL4x4_SUB.
 *
 * After tst/ands with the small masks used here the N flag is always
 * clear, so branching on "eq" takes exactly the same paths as a signed
 * "le" test would.
 */
sgemm_kernel_L4_BEGIN:

    mov     pCRow0, pC                  // C pointer for this panel
    add     pC, pC, LDC, lsl #2         // pC += 4 * LDC for the next panel

    mov     pA, origPA                  // restart at the beginning of A

sgemm_kernel_L4_M4_BEGIN:

    asr     counterI, origM, #2         // number of 4-high tiles along M
    cmp     counterI, #0
    b.le    sgemm_kernel_L4_M2_BEGIN

sgemm_kernel_L4_M4_20:

    mov     pB, origPB
    asr     counterL, origK, #1         // pairs of k-steps
    cmp     counterL, #2                // at least two pairs (K >= 4)?
    b.lt    sgemm_kernel_L4_M4_32

    KERNEL4x4_I                         // k-step 0, primes the pipeline
    KERNEL4x4_M2                        // k-step 1

    subs    counterL, counterL, #2      // one pair done, one reserved for the tail
    b.le    sgemm_kernel_L4_M4_22a
    .align 5

sgemm_kernel_L4_M4_22:

    KERNEL4x4_M1
    KERNEL4x4_M2

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L4_M4_22

sgemm_kernel_L4_M4_22a:

    KERNEL4x4_M1                        // final pair: drain the pipeline
    KERNEL4x4_E

    b       sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_32:                  // fewer than two pairs (K < 4)

    tst     counterL, #1
    b.eq    sgemm_kernel_L4_M4_40       // no complete pair at all

    KERNEL4x4_I                         // exactly one pair
    KERNEL4x4_E

    b       sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_40:

    INIT4x4                             // nothing multiplied yet: clear accumulators

sgemm_kernel_L4_M4_44:

    ands    counterL, origK, #1         // leftover k-step when K is odd
    b.eq    sgemm_kernel_L4_M4_100

sgemm_kernel_L4_M4_46:

    KERNEL4x4_SUB

    subs    counterL, counterL, #1
    b.ne    sgemm_kernel_L4_M4_46

sgemm_kernel_L4_M4_100:

    SAVE4x4

sgemm_kernel_L4_M4_END:

    subs    counterI, counterI, #1
    b.ne    sgemm_kernel_L4_M4_20
|
|
|
|
|
|
/*
 * Panel of four along N, M tail: handle a 2x4 tile when bit 1 of M is
 * set.  counterI keeps the full value of M because the 1-row tail that
 * follows tests bit 0 of it.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L4_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                // M % 4 == 0: no M tail at all
    b.eq    sgemm_kernel_L4_END

    tst     counterI, #2                // bit 1 of M clear: no 2-row tile
    b.eq    sgemm_kernel_L4_M1_BEGIN

sgemm_kernel_L4_M2_20:

    INIT2x4

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L4_M2_40

sgemm_kernel_L4_M2_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL2x4_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L4_M2_22

sgemm_kernel_L4_M2_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L4_M2_100

sgemm_kernel_L4_M2_42:

    KERNEL2x4_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L4_M2_42

sgemm_kernel_L4_M2_100:

    SAVE2x4

sgemm_kernel_L4_M2_END:
|
|
|
|
|
|
/*
 * Panel of four along N, M tail: handle a 1x4 tile when bit 0 of M is
 * set (counterI still holds M here), then close the panel by moving
 * origPB past the 4 * K floats just consumed and looping while panels
 * remain.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L4_M1_BEGIN:

    tst     counterI, #1                // M even: no single row left
    b.eq    sgemm_kernel_L4_END

sgemm_kernel_L4_M1_20:

    INIT1x4

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L4_M1_40

sgemm_kernel_L4_M1_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL1x4_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L4_M1_22

sgemm_kernel_L4_M1_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L4_M1_100

sgemm_kernel_L4_M1_42:

    KERNEL1x4_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L4_M1_42

sgemm_kernel_L4_M1_100:

    SAVE1x4

sgemm_kernel_L4_END:

    add     origPB, origPB, origK, lsl #4   // B += K * 4 floats * 4 bytes

    subs    counterJ, counterJ, #1          // next panel of four
    b.gt    sgemm_kernel_L4_BEGIN
|
|
|
|
|
|
|
|
/*********************************************************************************************/
|
|
|
|
/*
 * N tail, panel of two: reached once all panels of four are done.
 * If N % 4 == 0 there is nothing left and control goes to the exit; if
 * bit 1 of N is clear the panel of two is skipped.  Otherwise all full
 * 4x2 tiles along M are processed here.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L2_BEGIN:                  // fewer than four left along N

    mov     counterJ, origN
    tst     counterJ, #3
    b.eq    sgemm_kernel_L999           // N was a multiple of four: finished

    tst     counterJ, #2
    b.eq    sgemm_kernel_L1_BEGIN       // no panel of two, try the panel of one

    mov     pCRow0, pC                  // C pointer for this panel
    add     pC, pC, LDC, lsl #1         // pC += 2 * LDC for the next panel

    mov     pA, origPA                  // restart at the beginning of A

sgemm_kernel_L2_M4_BEGIN:

    asr     counterI, origM, #2         // number of 4-high tiles along M
    cmp     counterI, #0
    b.le    sgemm_kernel_L2_M2_BEGIN

sgemm_kernel_L2_M4_20:

    INIT4x2

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L2_M4_40
    .align 5

sgemm_kernel_L2_M4_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL4x2_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M4_22

sgemm_kernel_L2_M4_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L2_M4_100

sgemm_kernel_L2_M4_42:

    KERNEL4x2_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M4_42

sgemm_kernel_L2_M4_100:

    SAVE4x2

sgemm_kernel_L2_M4_END:

    subs    counterI, counterI, #1
    b.gt    sgemm_kernel_L2_M4_20
|
|
|
|
|
|
/*
 * Panel of two along N, M tail: handle a 2x2 tile when bit 1 of M is
 * set.  counterI keeps the full value of M because the 1-row tail that
 * follows tests bit 0 of it.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L2_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                // M % 4 == 0: no M tail at all
    b.eq    sgemm_kernel_L2_END

    tst     counterI, #2                // bit 1 of M clear: no 2-row tile
    b.eq    sgemm_kernel_L2_M1_BEGIN

sgemm_kernel_L2_M2_20:

    INIT2x2

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L2_M2_40

sgemm_kernel_L2_M2_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL2x2_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M2_22

sgemm_kernel_L2_M2_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L2_M2_100

sgemm_kernel_L2_M2_42:

    KERNEL2x2_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M2_42

sgemm_kernel_L2_M2_100:

    SAVE2x2

sgemm_kernel_L2_M2_END:
|
|
|
|
|
|
/*
 * Panel of two along N, M tail: handle a 1x2 tile when bit 0 of M is
 * set (counterI still holds M here), then move origPB past the 2 * K
 * floats consumed by this panel.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L2_M1_BEGIN:

    tst     counterI, #1                // M even: no single row left
    b.eq    sgemm_kernel_L2_END

sgemm_kernel_L2_M1_20:

    INIT1x2

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L2_M1_40

sgemm_kernel_L2_M1_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL1x2_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M1_22

sgemm_kernel_L2_M1_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L2_M1_100

sgemm_kernel_L2_M1_42:

    KERNEL1x2_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L2_M1_42

sgemm_kernel_L2_M1_100:

    SAVE1x2

sgemm_kernel_L2_END:

    add     origPB, origPB, origK, lsl #3   // B += K * 2 floats * 4 bytes
|
|
|
|
/*********************************************************************************************/
|
|
|
|
/*
 * N tail, panel of one: executed only when bit 0 of N is set.  Sets up
 * the C and A pointers and processes all full 4x1 tiles along M.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L1_BEGIN:

    mov     counterJ, origN
    tst     counterJ, #1
    b.eq    sgemm_kernel_L999           // N even: finished

    mov     pCRow0, pC                  // C pointer for this panel
    add     pC, pCRow0, LDC             // pC += LDC

    mov     pA, origPA                  // restart at the beginning of A

sgemm_kernel_L1_M4_BEGIN:

    asr     counterI, origM, #2         // number of 4-high tiles along M
    cmp     counterI, #0
    b.le    sgemm_kernel_L1_M2_BEGIN

sgemm_kernel_L1_M4_20:

    INIT4x1

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L1_M4_40
    .align 5

sgemm_kernel_L1_M4_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL4x1_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M4_22

sgemm_kernel_L1_M4_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L1_M4_100

sgemm_kernel_L1_M4_42:

    KERNEL4x1_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M4_42

sgemm_kernel_L1_M4_100:

    SAVE4x1

sgemm_kernel_L1_M4_END:

    subs    counterI, counterI, #1
    b.gt    sgemm_kernel_L1_M4_20
|
|
|
|
|
|
/*
 * Panel of one along N, M tail: handle a 2x1 tile when bit 1 of M is
 * set.  counterI keeps the full value of M because the 1-row tail that
 * follows tests bit 0 of it.
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L1_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                // M % 4 == 0: no M tail at all
    b.eq    sgemm_kernel_L1_END

    tst     counterI, #2                // bit 1 of M clear: no 2-row tile
    b.eq    sgemm_kernel_L1_M1_BEGIN

sgemm_kernel_L1_M2_20:

    INIT2x1

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L1_M2_40

sgemm_kernel_L1_M2_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL2x1_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M2_22

sgemm_kernel_L1_M2_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L1_M2_100

sgemm_kernel_L1_M2_42:

    KERNEL2x1_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M2_42

sgemm_kernel_L1_M2_100:

    SAVE2x1

sgemm_kernel_L1_M2_END:
|
|
|
|
|
|
/*
 * Panel of one along N, M tail: handle the final 1x1 tile when bit 0 of
 * M is set (counterI still holds M here).
 *
 * tst/ands with these small masks always clear N, so "eq" selects the
 * same paths as a signed "le" test.
 */
sgemm_kernel_L1_M1_BEGIN:

    tst     counterI, #1                // M even: no single row left
    b.eq    sgemm_kernel_L1_END

sgemm_kernel_L1_M1_20:

    INIT1x1

    mov     pB, origPB
    asr     counterL, origK, #3         // groups of eight k-steps
    cmp     counterL, #0
    b.le    sgemm_kernel_L1_M1_40

sgemm_kernel_L1_M1_22:

    .rept   8                           // eight unrolled k-steps
    KERNEL1x1_SUB
    .endr

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M1_22

sgemm_kernel_L1_M1_40:

    ands    counterL, origK, #7         // remaining K % 8 k-steps
    b.eq    sgemm_kernel_L1_M1_100

sgemm_kernel_L1_M1_42:

    KERNEL1x1_SUB

    subs    counterL, counterL, #1
    b.gt    sgemm_kernel_L1_M1_42

sgemm_kernel_L1_M1_100:

    SAVE1x1

sgemm_kernel_L1_END:
|
|
|
|
|
|
/*
 * Common exit: reload d8..d17 from the 80-byte frame created at entry,
 * release the frame and return 0 in x0.
 */
sgemm_kernel_L999:

    ldp     d8, d9, [sp, #(0*16)]
    ldp     d10, d11, [sp, #(1*16)]
    ldp     d12, d13, [sp, #(2*16)]
    ldp     d14, d15, [sp, #(3*16)]
    ldp     d16, d17, [sp, #(4*16)]
    add     sp, sp, #(5*16)

    mov     x0, #0                      // return value
    ret

    EPILOGUE
|
|
|