small optimizations on sgemm_kernel for ARMV7
This commit is contained in:
parent
b3eab8fcb7
commit
2b801a00a5
|
@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2013/10/13 Saar
|
||||
* 2013/11/02 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
* 2013/10/13 Saar
|
||||
* 2013/11/02 Saar
|
||||
* UNROLL_N 4
|
||||
* UNROLL_M 4
|
||||
* SGEMM_P 128
|
||||
* SGEMM_Q 240
|
||||
* SGEMM_R 4096
|
||||
* A_PRE 96
|
||||
* B_PRE 96
|
||||
* C_PRE 64
|
||||
* SGEMM_R 12288
|
||||
* A_PRE 128
|
||||
* B_PRE 128
|
||||
* C_PRE 32
|
||||
*
|
||||
* Performance on Odroid U2:
|
||||
* Performance on Odroid U2:
|
||||
*
|
||||
* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS
|
||||
* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS
|
||||
* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS
|
||||
* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS
|
||||
* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
|
||||
* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
|
||||
* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
|
||||
* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
|
||||
**************************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define K1 r7
|
||||
#define BC r12
|
||||
|
||||
#define A_PRE 96
|
||||
#define B_PRE 96
|
||||
#define C_PRE 64
|
||||
#define A_PRE 128
|
||||
#define B_PRE 128
|
||||
#define C_PRE 32
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
|
@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNEL4x4_I
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
pld [ AO , #A_PRE-8 ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias BO!, { s8 - s9 }
|
||||
pld [ BO , #B_PRE-8 ]
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fldmias AO!, { s2 - s3 }
|
||||
|
@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
pld [ AO , #A_PRE ]
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s17 , s5, s12
|
||||
fldmias AO!, { s0 - s1 }
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fmacs s18 , s6, s12
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s19 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s13
|
||||
fldmias AO!, { s2 - s3 }
|
||||
fldmias BO!, { s8 - s11 }
|
||||
fmacs s21 , s5, s13
|
||||
fmacs s22 , s6, s13
|
||||
fldmias BO!, { s8 - s9 }
|
||||
//fldmias AO!, { s2 - s3 }
|
||||
fmacs s23 , s7, s13
|
||||
|
||||
fmacs s24 , s4, s14
|
||||
fldmias BO!, { s10 - s11 }
|
||||
//fldmias BO!, { s10 - s11 }
|
||||
fmacs s25 , s5, s14
|
||||
fmacs s26 , s6, s14
|
||||
fmacs s27 , s7, s14
|
||||
|
@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s5 }
|
||||
fldmias AO!, { s4 - s7 }
|
||||
fmacs s17 , s1, s8
|
||||
fmacs s18 , s2, s8
|
||||
fldmias AO!, { s6 - s7 }
|
||||
fldmias BO!, { s12 - s15 }
|
||||
//fldmias AO!, { s6 - s7 }
|
||||
fmacs s19 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s9
|
||||
fldmias BO!, { s12 - s13 }
|
||||
fmacs s21 , s1, s9
|
||||
fmacs s22 , s2, s9
|
||||
fldmias BO!, { s14 - s15 }
|
||||
//fldmias BO!, { s14 - s15 }
|
||||
fmacs s23 , s3, s9
|
||||
|
||||
fmacs s24 , s0, s10
|
||||
|
@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.macro KERNEL4x4_SUB
|
||||
|
||||
flds s8 , [ BO ]
|
||||
pld [ BO , #B_PRE ]
|
||||
|
||||
flds s0 , [ AO ]
|
||||
pld [ AO , #A_PRE ]
|
||||
flds s1 , [ AO, #4 ]
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
|
@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
pld [ CO1 , #C_PRE ]
|
||||
|
||||
ldr r3 , LDC
|
||||
add CO2 , CO1, r3
|
||||
flds s0, ALPHA
|
||||
add r4 , CO2, r3
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmias CO1, { s8 - s11 }
|
||||
pld [ r4 , #C_PRE ]
|
||||
|
||||
fmacs s8 , s0 , s16
|
||||
flds s12, [CO2]
|
||||
|
@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmacs s15, s0 , s23
|
||||
fsts s11, [CO1, #12 ]
|
||||
|
||||
pld [ CO1 , #C_PRE ]
|
||||
|
||||
fldmias r4, { s8 - s11 }
|
||||
|
||||
fmacs s8 , s0 , s24
|
||||
|
@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmacs s11, s0 , s27
|
||||
fsts s15, [CO2, #12 ]
|
||||
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
add CO2, r4 , r3
|
||||
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
fldmias CO2, { s12 - s15 }
|
||||
|
||||
|
@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fsts s11, [r4 , #12 ]
|
||||
fmacs s15, s0 , s31
|
||||
|
||||
pld [ r4 , #C_PRE ]
|
||||
fstmias CO2, { s12 - s15 }
|
||||
pld [ CO2 , #C_PRE ]
|
||||
|
||||
add CO1, CO1, #16
|
||||
|
||||
|
@ -891,78 +891,29 @@ _L4_M4_20:
|
|||
|
||||
|
||||
mov BO, BC
|
||||
asrs L , K1, #3 // L = L / 8
|
||||
cmp L , #3
|
||||
blt _L4_M4_30
|
||||
.align 5
|
||||
asrs L , K1, #1 // L = K1 / 2
|
||||
cmp L , #2
|
||||
blt _L4_M4_32
|
||||
|
||||
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
sub L, L, #2
|
||||
subs L, L, #2
|
||||
ble _L4_M4_22a
|
||||
.align 5
|
||||
|
||||
_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs L, L, #1
|
||||
bgt _L4_M4_22
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b _L4_M4_44
|
||||
|
||||
|
||||
_L4_M4_30:
|
||||
tst L, #3
|
||||
ble _L4_M4_40
|
||||
|
||||
tst L, #2
|
||||
ble _L4_M4_32
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
|
@ -974,13 +925,7 @@ _L4_M4_32:
|
|||
ble _L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b _L4_M4_44
|
||||
|
@ -993,7 +938,7 @@ _L4_M4_40:
|
|||
|
||||
_L4_M4_44:
|
||||
|
||||
ands L , K1, #7 // L = L % 8
|
||||
ands L , K1, #1 // L = K1 % 2
|
||||
ble _L4_M4_100
|
||||
|
||||
_L4_M4_46:
|
||||
|
|
|
@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2013/10/11 Saar
|
||||
* BLASTEST : xOK
|
||||
* CTEST : xOK
|
||||
* TEST : xOK
|
||||
* 2013/11/02 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
**************************************************************************************/
|
||||
|
||||
|
@ -218,6 +218,15 @@ _L4_M4_BEGIN:
|
|||
|
||||
_L4_M4_20:
|
||||
|
||||
pld [ AO1, #A_PRE ]
|
||||
pld [ AO2, #A_PRE ]
|
||||
pld [ AO3, #A_PRE ]
|
||||
pld [ AO4, #A_PRE ]
|
||||
COPY4x4
|
||||
|
||||
subs I , I , #1
|
||||
ble _L4_M4_40
|
||||
|
||||
COPY4x4
|
||||
|
||||
subs I , I , #1
|
||||
|
|
6
param.h
6
param.h
|
@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 192
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 128
|
||||
#define CGEMM_DEFAULT_P 96
|
||||
#define ZGEMM_DEFAULT_P 64
|
||||
|
||||
#define SGEMM_DEFAULT_Q 120
|
||||
#define SGEMM_DEFAULT_Q 240
|
||||
#define DGEMM_DEFAULT_Q 120
|
||||
#define CGEMM_DEFAULT_Q 120
|
||||
#define ZGEMM_DEFAULT_Q 120
|
||||
|
||||
#define SGEMM_DEFAULT_R 16384
|
||||
#define SGEMM_DEFAULT_R 12288
|
||||
#define DGEMM_DEFAULT_R 8192
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
|
Loading…
Reference in New Issue