small optimizations on sgemm_kernel for ARMV7
This commit is contained in:
parent
b3eab8fcb7
commit
2b801a00a5
|
@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
/**************************************************************************************
|
/**************************************************************************************
|
||||||
* 2013/10/13 Saar
|
* 2013/11/02 Saar
|
||||||
* BLASTEST : OK
|
* BLASTEST : OK
|
||||||
* CTEST : OK
|
* CTEST : OK
|
||||||
* TEST : OK
|
* TEST : OK
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* 2013/10/13 Saar
|
* 2013/11/02 Saar
|
||||||
* UNROLL_N 4
|
* UNROLL_N 4
|
||||||
* UNROLL_M 4
|
* UNROLL_M 4
|
||||||
* DGEMM_P 128
|
* SGEMM_P 128
|
||||||
* DGEMM_Q 240
|
* SGEMM_Q 240
|
||||||
* DGEMM_R 4096
|
* SGEMM_R 12288
|
||||||
* A_PRE 96
|
* A_PRE 128
|
||||||
* B_PRE 96
|
* B_PRE 128
|
||||||
* C_PRE 64
|
* C_PRE 32
|
||||||
*
|
*
|
||||||
* Performance on Odroid U2:
|
* Performance on Odroid U2:
|
||||||
*
|
*
|
||||||
* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS
|
* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
|
||||||
* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS
|
* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
|
||||||
* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS
|
* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
|
||||||
* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS
|
* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
|
||||||
**************************************************************************************/
|
**************************************************************************************/
|
||||||
|
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
|
@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define K1 r7
|
#define K1 r7
|
||||||
#define BC r12
|
#define BC r12
|
||||||
|
|
||||||
#define A_PRE 96
|
#define A_PRE 128
|
||||||
#define B_PRE 96
|
#define B_PRE 128
|
||||||
#define C_PRE 64
|
#define C_PRE 32
|
||||||
|
|
||||||
/**************************************************************************************
|
/**************************************************************************************
|
||||||
* Macro definitions
|
* Macro definitions
|
||||||
|
@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro KERNEL4x4_I
|
.macro KERNEL4x4_I
|
||||||
|
|
||||||
|
pld [ AO , #A_PRE ]
|
||||||
fldmias AO!, { s0 - s1 }
|
fldmias AO!, { s0 - s1 }
|
||||||
pld [ AO , #A_PRE-8 ]
|
pld [ BO , #B_PRE ]
|
||||||
fldmias BO!, { s8 - s9 }
|
fldmias BO!, { s8 - s9 }
|
||||||
pld [ BO , #B_PRE-8 ]
|
|
||||||
|
|
||||||
fmuls s16 , s0, s8
|
fmuls s16 , s0, s8
|
||||||
fldmias AO!, { s2 - s3 }
|
fldmias AO!, { s2 - s3 }
|
||||||
|
@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
pld [ AO , #A_PRE ]
|
pld [ AO , #A_PRE ]
|
||||||
fmacs s16 , s4, s12
|
fmacs s16 , s4, s12
|
||||||
fmacs s17 , s5, s12
|
fmacs s17 , s5, s12
|
||||||
fldmias AO!, { s0 - s1 }
|
fldmias AO!, { s0 - s3 }
|
||||||
fmacs s18 , s6, s12
|
fmacs s18 , s6, s12
|
||||||
pld [ BO , #B_PRE ]
|
pld [ BO , #B_PRE ]
|
||||||
fmacs s19 , s7, s12
|
fmacs s19 , s7, s12
|
||||||
|
|
||||||
fmacs s20 , s4, s13
|
fmacs s20 , s4, s13
|
||||||
fldmias AO!, { s2 - s3 }
|
fldmias BO!, { s8 - s11 }
|
||||||
fmacs s21 , s5, s13
|
fmacs s21 , s5, s13
|
||||||
fmacs s22 , s6, s13
|
fmacs s22 , s6, s13
|
||||||
fldmias BO!, { s8 - s9 }
|
//fldmias AO!, { s2 - s3 }
|
||||||
fmacs s23 , s7, s13
|
fmacs s23 , s7, s13
|
||||||
|
|
||||||
fmacs s24 , s4, s14
|
fmacs s24 , s4, s14
|
||||||
fldmias BO!, { s10 - s11 }
|
//fldmias BO!, { s10 - s11 }
|
||||||
fmacs s25 , s5, s14
|
fmacs s25 , s5, s14
|
||||||
fmacs s26 , s6, s14
|
fmacs s26 , s6, s14
|
||||||
fmacs s27 , s7, s14
|
fmacs s27 , s7, s14
|
||||||
|
@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNEL4x4_M1
|
.macro KERNEL4x4_M1
|
||||||
|
|
||||||
fmacs s16 , s0, s8
|
fmacs s16 , s0, s8
|
||||||
fldmias AO!, { s4 - s5 }
|
fldmias AO!, { s4 - s7 }
|
||||||
fmacs s17 , s1, s8
|
fmacs s17 , s1, s8
|
||||||
fmacs s18 , s2, s8
|
fmacs s18 , s2, s8
|
||||||
fldmias AO!, { s6 - s7 }
|
fldmias BO!, { s12 - s15 }
|
||||||
|
//fldmias AO!, { s6 - s7 }
|
||||||
fmacs s19 , s3, s8
|
fmacs s19 , s3, s8
|
||||||
|
|
||||||
fmacs s20 , s0, s9
|
fmacs s20 , s0, s9
|
||||||
fldmias BO!, { s12 - s13 }
|
|
||||||
fmacs s21 , s1, s9
|
fmacs s21 , s1, s9
|
||||||
fmacs s22 , s2, s9
|
fmacs s22 , s2, s9
|
||||||
fldmias BO!, { s14 - s15 }
|
//fldmias BO!, { s14 - s15 }
|
||||||
fmacs s23 , s3, s9
|
fmacs s23 , s3, s9
|
||||||
|
|
||||||
fmacs s24 , s0, s10
|
fmacs s24 , s0, s10
|
||||||
|
@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNEL4x4_SUB
|
.macro KERNEL4x4_SUB
|
||||||
|
|
||||||
flds s8 , [ BO ]
|
flds s8 , [ BO ]
|
||||||
pld [ BO , #B_PRE ]
|
|
||||||
|
|
||||||
flds s0 , [ AO ]
|
flds s0 , [ AO ]
|
||||||
pld [ AO , #A_PRE ]
|
|
||||||
flds s1 , [ AO, #4 ]
|
flds s1 , [ AO, #4 ]
|
||||||
|
|
||||||
fmacs s16 , s0, s8
|
fmacs s16 , s0, s8
|
||||||
|
@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SAVE4x4
|
.macro SAVE4x4
|
||||||
pld [ CO1 , #C_PRE ]
|
|
||||||
|
|
||||||
ldr r3 , LDC
|
ldr r3 , LDC
|
||||||
add CO2 , CO1, r3
|
add CO2 , CO1, r3
|
||||||
flds s0, ALPHA
|
flds s0, ALPHA
|
||||||
add r4 , CO2, r3
|
add r4 , CO2, r3
|
||||||
pld [ CO2 , #C_PRE ]
|
|
||||||
|
|
||||||
fldmias CO1, { s8 - s11 }
|
fldmias CO1, { s8 - s11 }
|
||||||
pld [ r4 , #C_PRE ]
|
|
||||||
|
|
||||||
fmacs s8 , s0 , s16
|
fmacs s8 , s0 , s16
|
||||||
flds s12, [CO2]
|
flds s12, [CO2]
|
||||||
|
@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmacs s15, s0 , s23
|
fmacs s15, s0 , s23
|
||||||
fsts s11, [CO1, #12 ]
|
fsts s11, [CO1, #12 ]
|
||||||
|
|
||||||
|
pld [ CO1 , #C_PRE ]
|
||||||
|
|
||||||
fldmias r4, { s8 - s11 }
|
fldmias r4, { s8 - s11 }
|
||||||
|
|
||||||
fmacs s8 , s0 , s24
|
fmacs s8 , s0 , s24
|
||||||
|
@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmacs s11, s0 , s27
|
fmacs s11, s0 , s27
|
||||||
fsts s15, [CO2, #12 ]
|
fsts s15, [CO2, #12 ]
|
||||||
|
|
||||||
|
pld [ CO2 , #C_PRE ]
|
||||||
|
|
||||||
add CO2, r4 , r3
|
add CO2, r4 , r3
|
||||||
|
|
||||||
pld [ CO2 , #C_PRE ]
|
|
||||||
|
|
||||||
fldmias CO2, { s12 - s15 }
|
fldmias CO2, { s12 - s15 }
|
||||||
|
|
||||||
|
@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fsts s11, [r4 , #12 ]
|
fsts s11, [r4 , #12 ]
|
||||||
fmacs s15, s0 , s31
|
fmacs s15, s0 , s31
|
||||||
|
|
||||||
|
pld [ r4 , #C_PRE ]
|
||||||
fstmias CO2, { s12 - s15 }
|
fstmias CO2, { s12 - s15 }
|
||||||
|
pld [ CO2 , #C_PRE ]
|
||||||
|
|
||||||
add CO1, CO1, #16
|
add CO1, CO1, #16
|
||||||
|
|
||||||
|
@ -891,78 +891,29 @@ _L4_M4_20:
|
||||||
|
|
||||||
|
|
||||||
mov BO, BC
|
mov BO, BC
|
||||||
asrs L , K1, #3 // L = L / 8
|
asrs L , K1, #1 // L = K1 / 2
|
||||||
cmp L , #3
|
cmp L , #2
|
||||||
blt _L4_M4_30
|
blt _L4_M4_32
|
||||||
.align 5
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
KERNEL4x4_I
|
KERNEL4x4_I
|
||||||
KERNEL4x4_M2
|
KERNEL4x4_M2
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
subs L, L, #2
|
||||||
KERNEL4x4_M2
|
ble _L4_M4_22a
|
||||||
KERNEL4x4_M1
|
.align 5
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
sub L, L, #2
|
|
||||||
|
|
||||||
_L4_M4_22:
|
_L4_M4_22:
|
||||||
|
|
||||||
KERNEL4x4_M1
|
KERNEL4x4_M1
|
||||||
KERNEL4x4_M2
|
KERNEL4x4_M2
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
subs L, L, #1
|
subs L, L, #1
|
||||||
bgt _L4_M4_22
|
bgt _L4_M4_22
|
||||||
|
|
||||||
KERNEL4x4_M1
|
_L4_M4_22a:
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_E
|
|
||||||
|
|
||||||
b _L4_M4_44
|
|
||||||
|
|
||||||
|
|
||||||
_L4_M4_30:
|
|
||||||
tst L, #3
|
|
||||||
ble _L4_M4_40
|
|
||||||
|
|
||||||
tst L, #2
|
|
||||||
ble _L4_M4_32
|
|
||||||
|
|
||||||
KERNEL4x4_I
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
KERNEL4x4_M1
|
||||||
KERNEL4x4_E
|
KERNEL4x4_E
|
||||||
|
|
||||||
|
@ -974,13 +925,7 @@ _L4_M4_32:
|
||||||
ble _L4_M4_40
|
ble _L4_M4_40
|
||||||
|
|
||||||
KERNEL4x4_I
|
KERNEL4x4_I
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_M2
|
|
||||||
KERNEL4x4_M1
|
|
||||||
KERNEL4x4_E
|
KERNEL4x4_E
|
||||||
|
|
||||||
b _L4_M4_44
|
b _L4_M4_44
|
||||||
|
@ -993,7 +938,7 @@ _L4_M4_40:
|
||||||
|
|
||||||
_L4_M4_44:
|
_L4_M4_44:
|
||||||
|
|
||||||
ands L , K1, #7 // L = L % 8
|
ands L , K1, #1 // L = K1 % 2
|
||||||
ble _L4_M4_100
|
ble _L4_M4_100
|
||||||
|
|
||||||
_L4_M4_46:
|
_L4_M4_46:
|
||||||
|
|
|
@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
/**************************************************************************************
|
/**************************************************************************************
|
||||||
* 2013/10/11 Saar
|
* 2013/11/02 Saar
|
||||||
* BLASTEST : xOK
|
* BLASTEST : OK
|
||||||
* CTEST : xOK
|
* CTEST : OK
|
||||||
* TEST : xOK
|
* TEST : OK
|
||||||
*
|
*
|
||||||
**************************************************************************************/
|
**************************************************************************************/
|
||||||
|
|
||||||
|
@ -218,6 +218,15 @@ _L4_M4_BEGIN:
|
||||||
|
|
||||||
_L4_M4_20:
|
_L4_M4_20:
|
||||||
|
|
||||||
|
pld [ AO1, #A_PRE ]
|
||||||
|
pld [ AO2, #A_PRE ]
|
||||||
|
pld [ AO3, #A_PRE ]
|
||||||
|
pld [ AO4, #A_PRE ]
|
||||||
|
COPY4x4
|
||||||
|
|
||||||
|
subs I , I , #1
|
||||||
|
ble _L4_M4_40
|
||||||
|
|
||||||
COPY4x4
|
COPY4x4
|
||||||
|
|
||||||
subs I , I , #1
|
subs I , I , #1
|
||||||
|
|
6
param.h
6
param.h
|
@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 192
|
#define SGEMM_DEFAULT_P 128
|
||||||
#define DGEMM_DEFAULT_P 128
|
#define DGEMM_DEFAULT_P 128
|
||||||
#define CGEMM_DEFAULT_P 96
|
#define CGEMM_DEFAULT_P 96
|
||||||
#define ZGEMM_DEFAULT_P 64
|
#define ZGEMM_DEFAULT_P 64
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 120
|
#define SGEMM_DEFAULT_Q 240
|
||||||
#define DGEMM_DEFAULT_Q 120
|
#define DGEMM_DEFAULT_Q 120
|
||||||
#define CGEMM_DEFAULT_Q 120
|
#define CGEMM_DEFAULT_Q 120
|
||||||
#define ZGEMM_DEFAULT_Q 120
|
#define ZGEMM_DEFAULT_Q 120
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_R 16384
|
#define SGEMM_DEFAULT_R 12288
|
||||||
#define DGEMM_DEFAULT_R 8192
|
#define DGEMM_DEFAULT_R 8192
|
||||||
#define CGEMM_DEFAULT_R 4096
|
#define CGEMM_DEFAULT_R 4096
|
||||||
#define ZGEMM_DEFAULT_R 4096
|
#define ZGEMM_DEFAULT_R 4096
|
||||||
|
|
Loading…
Reference in New Issue