small optimizations on sgemm_kernel for ARMV7

This commit is contained in:
wernsaar 2013-11-02 13:06:11 +01:00
parent b3eab8fcb7
commit 2b801a00a5
3 changed files with 54 additions and 100 deletions

View File

@ -26,28 +26,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/10/13 Saar
* 2013/11/02 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2013/10/13 Saar
* 2013/11/02 Saar
* UNROLL_N 4
* UNROLL_M 4
* DGEMM_P 128
* DGEMM_Q 240
* DGEMM_R 4096
* A_PRE 96
* B_PRE 96
* C_PRE 64
* DGEMM_R 12288
* A_PRE 128
* B_PRE 128
* C_PRE 32
*
* Performance on Odroid U2:
* Performance on Odroid U2:
*
* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS
* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS
* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS
* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS
* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
**************************************************************************************/
#define ASSEMBLER
@ -92,9 +92,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K1 r7
#define BC r12
#define A_PRE 96
#define B_PRE 96
#define C_PRE 64
#define A_PRE 128
#define B_PRE 128
#define C_PRE 32
/**************************************************************************************
* Macro definitions
@ -123,10 +123,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
pld [ AO , #A_PRE ]
fldmias AO!, { s0 - s1 }
pld [ AO , #A_PRE-8 ]
pld [ BO , #B_PRE ]
fldmias BO!, { s8 - s9 }
pld [ BO , #B_PRE-8 ]
fmuls s16 , s0, s8
fldmias AO!, { s2 - s3 }
@ -162,20 +162,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ AO , #A_PRE ]
fmacs s16 , s4, s12
fmacs s17 , s5, s12
fldmias AO!, { s0 - s1 }
fldmias AO!, { s0 - s3 }
fmacs s18 , s6, s12
pld [ BO , #B_PRE ]
fmacs s19 , s7, s12
fmacs s20 , s4, s13
fldmias AO!, { s2 - s3 }
fldmias BO!, { s8 - s11 }
fmacs s21 , s5, s13
fmacs s22 , s6, s13
fldmias BO!, { s8 - s9 }
//fldmias AO!, { s2 - s3 }
fmacs s23 , s7, s13
fmacs s24 , s4, s14
fldmias BO!, { s10 - s11 }
//fldmias BO!, { s10 - s11 }
fmacs s25 , s5, s14
fmacs s26 , s6, s14
fmacs s27 , s7, s14
@ -191,17 +191,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_M1
fmacs s16 , s0, s8
fldmias AO!, { s4 - s5 }
fldmias AO!, { s4 - s7 }
fmacs s17 , s1, s8
fmacs s18 , s2, s8
fldmias AO!, { s6 - s7 }
fldmias BO!, { s12 - s15 }
//fldmias AO!, { s6 - s7 }
fmacs s19 , s3, s8
fmacs s20 , s0, s9
fldmias BO!, { s12 - s13 }
fmacs s21 , s1, s9
fmacs s22 , s2, s9
fldmias BO!, { s14 - s15 }
//fldmias BO!, { s14 - s15 }
fmacs s23 , s3, s9
fmacs s24 , s0, s10
@ -248,10 +248,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
flds s8 , [ BO ]
pld [ BO , #B_PRE ]
flds s0 , [ AO ]
pld [ AO , #A_PRE ]
flds s1 , [ AO, #4 ]
fmacs s16 , s0, s8
@ -284,16 +282,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
pld [ CO1 , #C_PRE ]
ldr r3 , LDC
add CO2 , CO1, r3
flds s0, ALPHA
add r4 , CO2, r3
pld [ CO2 , #C_PRE ]
fldmias CO1, { s8 - s11 }
pld [ r4 , #C_PRE ]
fmacs s8 , s0 , s16
flds s12, [CO2]
@ -313,6 +308,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmacs s15, s0 , s23
fsts s11, [CO1, #12 ]
pld [ CO1 , #C_PRE ]
fldmias r4, { s8 - s11 }
fmacs s8 , s0 , s24
@ -324,9 +321,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmacs s11, s0 , s27
fsts s15, [CO2, #12 ]
pld [ CO2 , #C_PRE ]
add CO2, r4 , r3
pld [ CO2 , #C_PRE ]
fldmias CO2, { s12 - s15 }
@ -339,7 +337,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fsts s11, [r4 , #12 ]
fmacs s15, s0 , s31
pld [ r4 , #C_PRE ]
fstmias CO2, { s12 - s15 }
pld [ CO2 , #C_PRE ]
add CO1, CO1, #16
@ -891,78 +891,29 @@ _L4_M4_20:
mov BO, BC
asrs L , K1, #3 // L = L / 8
cmp L , #3
blt _L4_M4_30
.align 5
asrs L , K1, #1 // L = K1 / 2 (arithmetic shift right by 1)
cmp L , #2
blt _L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
sub L, L, #2
subs L, L, #2
ble _L4_M4_22a
.align 5
_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs L, L, #1
bgt _L4_M4_22
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b _L4_M4_44
_L4_M4_30:
tst L, #3
ble _L4_M4_40
tst L, #2
ble _L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
@ -974,13 +925,7 @@ _L4_M4_32:
ble _L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b _L4_M4_44
@ -993,7 +938,7 @@ _L4_M4_40:
_L4_M4_44:
ands L , K1, #7 // L = L % 8
ands L , K1, #1 // L = K1 & 1 (low bit of K1)
ble _L4_M4_100
_L4_M4_46:

View File

@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/10/11 Saar
* BLASTEST : xOK
* CTEST : xOK
* TEST : xOK
* 2013/11/02 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
@ -218,6 +218,15 @@ _L4_M4_BEGIN:
_L4_M4_20:
pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
pld [ AO3, #A_PRE ]
pld [ AO4, #A_PRE ]
COPY4x4
subs I , I , #1
ble _L4_M4_40
COPY4x4
subs I , I , #1

View File

@ -1814,17 +1814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 192
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 64
#define SGEMM_DEFAULT_Q 120
#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 120
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 120
#define SGEMM_DEFAULT_R 16384
#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096