Cortex A57: Improvements to DGEMM 8x4 kernel
This commit is contained in:
parent
ff4c5deafa
commit
c54a29bb48
|
@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stp q0, q1, [pCRow0]
|
stp q0, q1, [pCRow0]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow0, pCRow0, #32
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ldp q2, q3, [pCRow0]
|
ldp q2, q3, [pCRow0]
|
||||||
fmla v2.2d, v18.2d, alphaV0
|
fmla v2.2d, v18.2d, alphaV0
|
||||||
|
@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stp q4, q5, [pCRow1]
|
stp q4, q5, [pCRow1]
|
||||||
|
|
||||||
add pCRow1, pCRow1, #32
|
add pCRow1, pCRow1, #32
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ldp q6, q7, [pCRow1]
|
ldp q6, q7, [pCRow1]
|
||||||
fmla v6.2d, v22.2d, alphaV0
|
fmla v6.2d, v22.2d, alphaV0
|
||||||
|
@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stp q0, q1, [pCRow2]
|
stp q0, q1, [pCRow2]
|
||||||
|
|
||||||
add pCRow2, pCRow2, #32
|
add pCRow2, pCRow2, #32
|
||||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ldp q2, q3, [pCRow2]
|
ldp q2, q3, [pCRow2]
|
||||||
fmla v2.2d, v26.2d, alphaV0
|
fmla v2.2d, v26.2d, alphaV0
|
||||||
|
@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stp q4, q5, [pCRow3]
|
stp q4, q5, [pCRow3]
|
||||||
|
|
||||||
add pCRow3, pCRow3, #32
|
add pCRow3, pCRow3, #32
|
||||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ldp q6, q7, [pCRow3]
|
ldp q6, q7, [pCRow3]
|
||||||
fmla v6.2d, v30.2d, alphaV0
|
fmla v6.2d, v30.2d, alphaV0
|
||||||
|
@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE4x4
|
.macro SAVE4x4
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
fmla v9.2d, v17.2d, alphaV0
|
fmla v9.2d, v17.2d, alphaV0
|
||||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
|
|
||||||
add pCRow1, pCRow0, LDC
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
add pCRow0, pCRow0, #32
|
||||||
|
|
||||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||||
fmla v12.2d, v20.2d, alphaV0
|
fmla v12.2d, v20.2d, alphaV0
|
||||||
fmla v13.2d, v21.2d, alphaV0
|
fmla v13.2d, v21.2d, alphaV0
|
||||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||||
|
|
||||||
add pCRow2, pCRow1, LDC
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #32
|
||||||
|
|
||||||
ld1 {v8.2d, v9.2d}, [pCRow2]
|
ld1 {v8.2d, v9.2d}, [pCRow2]
|
||||||
fmla v8.2d, v24.2d, alphaV0
|
fmla v8.2d, v24.2d, alphaV0
|
||||||
fmla v9.2d, v25.2d, alphaV0
|
fmla v9.2d, v25.2d, alphaV0
|
||||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||||
|
|
||||||
add pCRow1, pCRow2, LDC
|
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||||
|
add pCRow2, pCRow2, #32
|
||||||
|
|
||||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
ld1 {v12.2d, v13.2d}, [pCRow3]
|
||||||
fmla v12.2d, v28.2d, alphaV0
|
fmla v12.2d, v28.2d, alphaV0
|
||||||
fmla v13.2d, v29.2d, alphaV0
|
fmla v13.2d, v29.2d, alphaV0
|
||||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
st1 {v12.2d, v13.2d}, [pCRow3]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||||
|
add pCRow3, pCRow3, #32
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE2x4
|
.macro SAVE2x4
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d}, [pCRow0]
|
ld1 {v8.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
st1 {v8.2d}, [pCRow0]
|
st1 {v8.2d}, [pCRow0]
|
||||||
|
|
||||||
add pCRow1, pCRow0, LDC
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
add pCRow0, pCRow0, #16
|
||||||
|
|
||||||
ld1 {v12.2d}, [pCRow1]
|
ld1 {v12.2d}, [pCRow1]
|
||||||
fmla v12.2d, v20.2d, alphaV0
|
fmla v12.2d, v20.2d, alphaV0
|
||||||
st1 {v12.2d}, [pCRow1]
|
st1 {v12.2d}, [pCRow1]
|
||||||
|
|
||||||
add pCRow2, pCRow1, LDC
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #16
|
||||||
|
|
||||||
ld1 {v8.2d}, [pCRow2]
|
ld1 {v8.2d}, [pCRow2]
|
||||||
fmla v8.2d, v24.2d, alphaV0
|
fmla v8.2d, v24.2d, alphaV0
|
||||||
st1 {v8.2d}, [pCRow2]
|
st1 {v8.2d}, [pCRow2]
|
||||||
|
|
||||||
add pCRow1, pCRow2, LDC
|
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||||
|
add pCRow2, pCRow2, #16
|
||||||
|
|
||||||
ld1 {v12.2d}, [pCRow1]
|
ld1 {v12.2d}, [pCRow3]
|
||||||
fmla v12.2d, v28.2d, alphaV0
|
fmla v12.2d, v28.2d, alphaV0
|
||||||
st1 {v12.2d}, [pCRow1]
|
st1 {v12.2d}, [pCRow3]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #16
|
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||||
|
add pCRow3, pCRow3, #16
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE1x4
|
.macro SAVE1x4
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
add pCRow1, pCRow0, LDC
|
|
||||||
|
|
||||||
ld1 {v8.d}[0], [pCRow0]
|
ld1 {v8.d}[0], [pCRow0]
|
||||||
ld1 {v8.d}[1], [pCRow1]
|
ld1 {v8.d}[1], [pCRow1]
|
||||||
|
@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
st1 {v8.d}[0], [pCRow0]
|
st1 {v8.d}[0], [pCRow0]
|
||||||
st1 {v8.d}[1], [pCRow1]
|
st1 {v8.d}[1], [pCRow1]
|
||||||
|
|
||||||
add pCRow2, pCRow1, LDC
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow1, pCRow2, LDC
|
add pCRow0, pCRow0, #8
|
||||||
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #8
|
||||||
|
|
||||||
ld1 {v12.d}[0], [pCRow2]
|
ld1 {v12.d}[0], [pCRow2]
|
||||||
ld1 {v12.d}[1], [pCRow1]
|
ld1 {v12.d}[1], [pCRow3]
|
||||||
fmla v12.2d, v20.2d, alphaV0
|
fmla v12.2d, v20.2d, alphaV0
|
||||||
st1 {v12.d}[0], [pCRow2]
|
st1 {v12.d}[0], [pCRow2]
|
||||||
st1 {v12.d}[1], [pCRow1]
|
st1 {v12.d}[1], [pCRow3]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #8
|
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||||
|
add pCRow2, pCRow2, #8
|
||||||
|
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||||
|
add pCRow3, pCRow3, #8
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla v18.2d, v2.2d, v8.d[0]
|
fmla v18.2d, v2.2d, v8.d[0]
|
||||||
fmla v19.2d, v3.2d, v8.d[0]
|
fmla v19.2d, v3.2d, v8.d[0]
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
fmla v20.2d, v0.2d, v8.d[1]
|
fmla v20.2d, v0.2d, v8.d[1]
|
||||||
fmla v21.2d, v1.2d, v8.d[1]
|
fmla v21.2d, v1.2d, v8.d[1]
|
||||||
fmla v22.2d, v2.2d, v8.d[1]
|
fmla v22.2d, v2.2d, v8.d[1]
|
||||||
|
@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE8x2
|
.macro SAVE8x2
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
add pCRow1, pCRow0, LDC
|
|
||||||
|
|
||||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||||
fmla v0.2d, v16.2d, alphaV0
|
fmla v0.2d, v16.2d, alphaV0
|
||||||
|
@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla v3.2d, v19.2d, alphaV0
|
fmla v3.2d, v19.2d, alphaV0
|
||||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
add pCRow0, pCRow0, #64
|
||||||
|
|
||||||
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||||
fmla v4.2d, v20.2d, alphaV0
|
fmla v4.2d, v20.2d, alphaV0
|
||||||
fmla v5.2d, v21.2d, alphaV0
|
fmla v5.2d, v21.2d, alphaV0
|
||||||
|
@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla v7.2d, v23.2d, alphaV0
|
fmla v7.2d, v23.2d, alphaV0
|
||||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #64
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #64
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE4x2
|
.macro SAVE4x2
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
fmla v9.2d, v17.2d, alphaV0
|
fmla v9.2d, v17.2d, alphaV0
|
||||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
|
|
||||||
add pCRow1, pCRow0, LDC
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
add pCRow0, pCRow0, #32
|
||||||
|
|
||||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||||
fmla v12.2d, v20.2d, alphaV0
|
fmla v12.2d, v20.2d, alphaV0
|
||||||
fmla v13.2d, v21.2d, alphaV0
|
fmla v13.2d, v21.2d, alphaV0
|
||||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #32
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #32
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE2x2
|
.macro SAVE2x2
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d}, [pCRow0]
|
ld1 {v8.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
st1 {v8.2d}, [pCRow0]
|
st1 {v8.2d}, [pCRow0]
|
||||||
|
|
||||||
add pCRow1 , pCRow0, LDC
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
|
add pCRow0, pCRow0, #16
|
||||||
|
|
||||||
ld1 {v12.2d}, [pCRow1]
|
ld1 {v12.2d}, [pCRow1]
|
||||||
fmla v12.2d, v20.2d, alphaV0
|
fmla v12.2d, v20.2d, alphaV0
|
||||||
st1 {v12.2d}, [pCRow1]
|
st1 {v12.2d}, [pCRow1]
|
||||||
|
|
||||||
add pCRow0, pCRow0, #16
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #16
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE1x2
|
.macro SAVE1x2
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
add pCRow1 , pCRow0, LDC
|
|
||||||
|
|
||||||
ld1 {v8.d}[0], [pCRow0]
|
ld1 {v8.d}[0], [pCRow0]
|
||||||
ld1 {v8.d}[1], [pCRow1]
|
ld1 {v8.d}[1], [pCRow1]
|
||||||
|
@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
st1 {v8.d}[0], [pCRow0]
|
st1 {v8.d}[0], [pCRow0]
|
||||||
st1 {v8.d}[1], [pCRow1]
|
st1 {v8.d}[1], [pCRow1]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow0, pCRow0, #8
|
add pCRow0, pCRow0, #8
|
||||||
|
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||||
|
add pCRow1, pCRow1, #8
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
fmla v16.2d, v0.2d, v8.d[0]
|
fmla v16.2d, v0.2d, v8.d[0]
|
||||||
fmla v17.2d, v1.2d, v8.d[0]
|
fmla v17.2d, v1.2d, v8.d[0]
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
fmla v18.2d, v2.2d, v8.d[0]
|
fmla v18.2d, v2.2d, v8.d[0]
|
||||||
fmla v19.2d, v3.2d, v8.d[0]
|
fmla v19.2d, v3.2d, v8.d[0]
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SAVE8x1
|
.macro SAVE8x1
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||||
fmla v0.2d, v16.2d, alphaV0
|
fmla v0.2d, v16.2d, alphaV0
|
||||||
fmla v1.2d, v17.2d, alphaV0
|
fmla v1.2d, v17.2d, alphaV0
|
||||||
|
@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla v3.2d, v19.2d, alphaV0
|
fmla v3.2d, v19.2d, alphaV0
|
||||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow0, pCRow0, #64
|
add pCRow0, pCRow0, #64
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE4x1
|
.macro SAVE4x1
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
fmla v9.2d, v17.2d, alphaV0
|
fmla v9.2d, v17.2d, alphaV0
|
||||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow0, pCRow0, #32
|
add pCRow0, pCRow0, #32
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE2x1
|
.macro SAVE2x1
|
||||||
fmov alpha0, alpha
|
fmov alpha0, alpha
|
||||||
|
|
||||||
ld1 {v8.2d}, [pCRow0]
|
ld1 {v8.2d}, [pCRow0]
|
||||||
fmla v8.2d, v16.2d, alphaV0
|
fmla v8.2d, v16.2d, alphaV0
|
||||||
st1 {v8.2d}, [pCRow0]
|
st1 {v8.2d}, [pCRow0]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow0, pCRow0, #16
|
add pCRow0, pCRow0, #16
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmadd d8, d16, alpha0, d8
|
fmadd d8, d16, alpha0, d8
|
||||||
str d8, [pCRow0]
|
str d8, [pCRow0]
|
||||||
|
|
||||||
|
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||||
add pCRow0, pCRow0, #8
|
add pCRow0, pCRow0, #8
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L4_BEGIN:
|
dgemm_kernel_L4_BEGIN:
|
||||||
mov pCRow0, pC
|
mov pCRow0, pC
|
||||||
add pCRow1, pCRow0, LDC
|
add pCRow1, pCRow0, LDC
|
||||||
|
@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20:
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L4_M4_40
|
ble dgemm_kernel_L4_M4_40
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L4_M4_22:
|
dgemm_kernel_L4_M4_22:
|
||||||
|
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L4_M4_22
|
bgt dgemm_kernel_L4_M4_22
|
||||||
|
@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40:
|
||||||
dgemm_kernel_L4_M4_42:
|
dgemm_kernel_L4_M4_42:
|
||||||
|
|
||||||
KERNEL4x4_SUB
|
KERNEL4x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L4_M4_42
|
bgt dgemm_kernel_L4_M4_42
|
||||||
|
@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100:
|
||||||
|
|
||||||
dgemm_kernel_L4_M4_END:
|
dgemm_kernel_L4_M4_END:
|
||||||
|
|
||||||
|
|
||||||
dgemm_kernel_L4_M2_BEGIN:
|
dgemm_kernel_L4_M2_BEGIN:
|
||||||
|
|
||||||
mov counterI, origM
|
mov counterI, origM
|
||||||
|
@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20:
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L4_M2_40
|
ble dgemm_kernel_L4_M2_40
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L4_M2_22:
|
dgemm_kernel_L4_M2_22:
|
||||||
|
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
|
@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L4_M2_100
|
ble dgemm_kernel_L4_M2_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||||
dgemm_kernel_L4_M2_42:
|
dgemm_kernel_L4_M2_42:
|
||||||
|
|
||||||
KERNEL2x4_SUB
|
KERNEL2x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L4_M2_42
|
bgt dgemm_kernel_L4_M2_42
|
||||||
|
@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20:
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L4_M1_40
|
ble dgemm_kernel_L4_M1_40
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L4_M1_22:
|
dgemm_kernel_L4_M1_22:
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
|
KERNEL1x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
KERNEL1x4_SUB
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
|
@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L4_M1_100
|
ble dgemm_kernel_L4_M1_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
dgemm_kernel_L4_M1_42:
|
dgemm_kernel_L4_M1_42:
|
||||||
|
|
||||||
KERNEL1x4_SUB
|
KERNEL1x4_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L4_M1_42
|
bgt dgemm_kernel_L4_M1_42
|
||||||
|
@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
|
||||||
tst counterJ , #2
|
tst counterJ , #2
|
||||||
ble dgemm_kernel_L1_BEGIN
|
ble dgemm_kernel_L1_BEGIN
|
||||||
|
|
||||||
mov pCRow0, pC // pCRow0 = pC
|
mov pCRow0, pC
|
||||||
|
add pCRow1, pCRow0, LDC
|
||||||
|
|
||||||
add pC,pC,LDC, lsl #1
|
add pC, pCRow1, LDC
|
||||||
|
|
||||||
mov pA, origPA // pA = A
|
mov pA, origPA // pA = A
|
||||||
|
|
||||||
|
@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN:
|
||||||
cmp counterI, #0
|
cmp counterI, #0
|
||||||
ble dgemm_kernel_L2_M4_BEGIN
|
ble dgemm_kernel_L2_M4_BEGIN
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L2_M8_20:
|
dgemm_kernel_L2_M8_20:
|
||||||
|
|
||||||
INIT8x2
|
INIT8x2
|
||||||
|
@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20:
|
||||||
asr counterL , origK, #3 // counterL = counterL / 8
|
asr counterL , origK, #3 // counterL = counterL / 8
|
||||||
cmp counterL,#0
|
cmp counterL,#0
|
||||||
ble dgemm_kernel_L2_M8_40
|
ble dgemm_kernel_L2_M8_40
|
||||||
.align 5
|
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L2_M8_22:
|
dgemm_kernel_L2_M8_22:
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
|
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L2_M8_22
|
bgt dgemm_kernel_L2_M8_22
|
||||||
|
|
||||||
|
|
||||||
dgemm_kernel_L2_M8_40:
|
dgemm_kernel_L2_M8_40:
|
||||||
|
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L2_M8_100
|
ble dgemm_kernel_L2_M8_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||||
dgemm_kernel_L2_M8_42:
|
dgemm_kernel_L2_M8_42:
|
||||||
|
|
||||||
KERNEL8x2_SUB
|
KERNEL8x2_SUB
|
||||||
|
@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20:
|
||||||
asr counterL , origK, #3 // counterL = counterL / 8
|
asr counterL , origK, #3 // counterL = counterL / 8
|
||||||
cmp counterL,#0
|
cmp counterL,#0
|
||||||
ble dgemm_kernel_L2_M4_40
|
ble dgemm_kernel_L2_M4_40
|
||||||
.align 5
|
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L2_M4_22:
|
dgemm_kernel_L2_M4_22:
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
|
@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L2_M4_100
|
ble dgemm_kernel_L2_M4_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||||
dgemm_kernel_L2_M4_42:
|
dgemm_kernel_L2_M4_42:
|
||||||
|
|
||||||
KERNEL4x2_SUB
|
KERNEL4x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L2_M4_42
|
bgt dgemm_kernel_L2_M4_42
|
||||||
|
@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20:
|
||||||
dgemm_kernel_L2_M2_22:
|
dgemm_kernel_L2_M2_22:
|
||||||
|
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x2_SUB
|
KERNEL2x2_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L2_M2_22
|
bgt dgemm_kernel_L2_M2_22
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||||
dgemm_kernel_L2_M2_40:
|
dgemm_kernel_L2_M2_40:
|
||||||
|
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
|
@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20:
|
||||||
dgemm_kernel_L2_M1_22:
|
dgemm_kernel_L2_M1_22:
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
KERNEL1x2_SUB
|
KERNEL1x2_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L2_M1_22
|
bgt dgemm_kernel_L2_M1_22
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||||
dgemm_kernel_L2_M1_40:
|
dgemm_kernel_L2_M1_40:
|
||||||
|
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
|
@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN:
|
||||||
cmp counterI, #0
|
cmp counterI, #0
|
||||||
ble dgemm_kernel_L1_M4_BEGIN
|
ble dgemm_kernel_L1_M4_BEGIN
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L1_M8_20:
|
dgemm_kernel_L1_M8_20:
|
||||||
|
|
||||||
INIT8x1
|
INIT8x1
|
||||||
|
@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20:
|
||||||
asr counterL , origK, #3 // counterL = counterL / 8
|
asr counterL , origK, #3 // counterL = counterL / 8
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L1_M8_40
|
ble dgemm_kernel_L1_M8_40
|
||||||
.align 5
|
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L1_M8_22:
|
dgemm_kernel_L1_M8_22:
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
|
@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L1_M8_100
|
ble dgemm_kernel_L1_M8_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
dgemm_kernel_L1_M8_42:
|
dgemm_kernel_L1_M8_42:
|
||||||
|
|
||||||
KERNEL8x1_SUB
|
KERNEL8x1_SUB
|
||||||
|
@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20:
|
||||||
asr counterL , origK, #3 // counterL = counterL / 8
|
asr counterL , origK, #3 // counterL = counterL / 8
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L1_M4_40
|
ble dgemm_kernel_L1_M4_40
|
||||||
.align 5
|
|
||||||
|
|
||||||
|
.align 5
|
||||||
dgemm_kernel_L1_M4_22:
|
dgemm_kernel_L1_M4_22:
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
|
||||||
|
KERNEL4x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
KERNEL4x1_SUB
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
|
@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L1_M4_100
|
ble dgemm_kernel_L1_M4_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
dgemm_kernel_L1_M4_42:
|
dgemm_kernel_L1_M4_42:
|
||||||
|
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L1_M4_42
|
bgt dgemm_kernel_L1_M4_42
|
||||||
|
@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22:
|
||||||
|
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
|
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
KERNEL2x1_SUB
|
KERNEL2x1_SUB
|
||||||
|
|
||||||
subs counterL, counterL, #1
|
subs counterL, counterL, #1
|
||||||
bgt dgemm_kernel_L1_M2_22
|
bgt dgemm_kernel_L1_M2_22
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
dgemm_kernel_L1_M2_40:
|
dgemm_kernel_L1_M2_40:
|
||||||
|
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
|
@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20:
|
||||||
cmp counterL , #0
|
cmp counterL , #0
|
||||||
ble dgemm_kernel_L1_M1_40
|
ble dgemm_kernel_L1_M1_40
|
||||||
|
|
||||||
|
|
||||||
dgemm_kernel_L1_M1_22:
|
dgemm_kernel_L1_M1_22:
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
|
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
|
|
||||||
|
@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40:
|
||||||
ands counterL , origK, #7 // counterL = counterL % 8
|
ands counterL , origK, #7 // counterL = counterL % 8
|
||||||
ble dgemm_kernel_L1_M1_100
|
ble dgemm_kernel_L1_M1_100
|
||||||
|
|
||||||
|
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||||
|
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||||
dgemm_kernel_L1_M1_42:
|
dgemm_kernel_L1_M1_42:
|
||||||
|
|
||||||
KERNEL1x1_SUB
|
KERNEL1x1_SUB
|
||||||
|
|
Loading…
Reference in New Issue