Align general-purpose register usage with strmm_kernel_8x8

This commit is contained in:
张丹枫 2020-05-20 21:52:49 +08:00
parent 0e6eb8c247
commit edb423d772
1 changed files with 120 additions and 122 deletions

View File

@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
@ -78,14 +77,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 17
// 18 must save
// 19 must save
// 20 must save pA0_2, pA0_3
// 21 must save pA0_6, pA0_7
// 22 must save pA1_2, pA1_3
// 23 must save pA1_6, pA1_7
// 24 must save pB0_2, pB0_3
// 25 must save pB0_6, pB0_7
// 26 must save pB1_2, pB1_3
// 27 must save pB1_6, pB1_7
// 28 must save
// 29 frame
// 30 link
@ -155,13 +154,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ldr d3, [pA, #8]
ldr d7, [pB, #8]
ldr x20, [pA], #16
ldr x22, [pA], #16
fmul v16.4s, v0.4s, v4.s[0]
ldr x24, [pB], #16
ldr x26, [pB], #16
fmul v17.4s, v1.4s, v4.s[0]
ldr x21, [pA], #8
ldr x23, [pA], #8
fmul v18.4s, v0.4s, v4.s[1]
ldr x25, [pB], #8
ldr x27, [pB], #8
fmul v19.4s, v1.4s, v4.s[1]
fmul v20.4s, v0.4s, v4.s[2]
fmul v21.4s, v1.4s, v4.s[2]
@ -179,21 +178,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x8_M1
ldr d2, [pA], #8
fmov v0.d[1], x18
fmov v0.d[1], x20
ldr d6, [pB], #8
fmov v4.d[1], x22
fmov v4.d[1], x24
ldr d3, [pA, #8]
fmov v1.d[1], x19
fmov v1.d[1], x21
ldr d7, [pB, #8]
fmov v5.d[1], x23
fmov v5.d[1], x25
fmla v16.4s, v0.4s, v4.s[0]
ldr x20, [pA], #16
ldr x22, [pA], #16
fmla v17.4s, v1.4s, v4.s[0]
ldr x24, [pB], #16
ldr x26, [pB], #16
fmla v18.4s, v0.4s, v4.s[1]
ldr x21, [pA], #8
ldr x23, [pA], #8
fmla v19.4s, v1.4s, v4.s[1]
ldr x25, [pB], #8
ldr x27, [pB], #8
fmla v20.4s, v0.4s, v4.s[2]
fmla v21.4s, v1.4s, v4.s[2]
fmla v22.4s, v0.4s, v4.s[3]
@ -210,21 +209,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x8_M2
ldr d0, [pA], #8
fmov v2.d[1], x20
fmov v2.d[1], x22
ldr d4, [pB], #8
fmov v6.d[1], x24
fmov v6.d[1], x26
ldr d1, [pA, #8]
fmov v3.d[1], x21
fmov v3.d[1], x23
ldr d5, [pB, #8]
fmov v7.d[1], x25
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
ldr x18, [pA], #16
ldr x20, [pA], #16
fmla v17.4s, v3.4s, v6.s[0]
ldr x22, [pB], #16
ldr x24, [pB], #16
fmla v18.4s, v2.4s, v6.s[1]
ldr x19, [pA], #8
ldr x21, [pA], #8
fmla v19.4s, v3.4s, v6.s[1]
ldr x23, [pB], #8
ldr x25, [pB], #8
fmla v20.4s, v2.4s, v6.s[2]
fmla v21.4s, v3.4s, v6.s[2]
fmla v22.4s, v2.4s, v6.s[3]
@ -240,10 +239,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x8_E
fmov v2.d[1], x20
fmov v6.d[1], x24
fmov v3.d[1], x21
fmov v7.d[1], x25
fmov v2.d[1], x22
fmov v6.d[1], x26
fmov v3.d[1], x23
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
fmla v17.4s, v3.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.s[1]
@ -363,67 +362,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// KERNEL4x8_I: opening step of the 4x8 single-precision micro-kernel.
//
// Loads one 4-lane vector from pA (v0) and two from pB (v4, v5), then writes
// the eight accumulators v16, v18, ..., v30 with fmul (they are overwritten,
// not added to). While the multiplies issue, the operands for the following
// step are fetched in 64-bit pieces: low halves go straight into d2 / d6 / d7,
// high halves are parked in x21 / x26 / x27 and are merged into v2 / v6 / v7
// with fmov by the macro that consumes them (KERNEL4x8_M2 or KERNEL4x8_E).
//
// Net pointer movement: pA += 32 bytes, pB += 64 bytes.
// Writes: v0, v2 (low), v4-v7 (v6/v7 low only), v16..v30 (even), x21, x26, x27.
.macro KERNEL4x8_I
	ld1	{v0.4s}, [pA], #16
	ld1	{v4.4s, v5.4s}, [pB], #32

	ldr	d2, [pA], #8
	ldr	d6, [pB], #8
	ldr	d7, [pB, #8]		// low half of the second pB vector; pB is not advanced here
	ldr	x21, [pA], #8		// high half destined for v2

	fmul	v16.4s, v0.4s, v4.s[0]
	ldr	x26, [pB], #16		// high half destined for v6; also steps over the 8 bytes already in d7
	fmul	v18.4s, v0.4s, v4.s[1]
	ldr	x27, [pB], #8		// high half destined for v7
	fmul	v20.4s, v0.4s, v4.s[2]
	fmul	v22.4s, v0.4s, v4.s[3]
	fmul	v24.4s, v0.4s, v5.s[0]
	fmul	v26.4s, v0.4s, v5.s[1]
	fmul	v28.4s, v0.4s, v5.s[2]
	fmul	v30.4s, v0.4s, v5.s[3]
.endm
// KERNEL4x8_M1: middle step of the 4x8 micro-kernel that consumes v0 / v4 / v5.
//
// On entry the low halves of v0, v4, v5 are already loaded and their high
// halves are held in x20, x24, x25; the fmov instructions complete the
// vectors. The eight accumulators v16, v18, ..., v30 are updated with fmla.
// Interleaved with the arithmetic, the operands for the next step are
// fetched: low halves into d2 / d6 / d7, high halves into x21 / x26 / x27.
//
// Net pointer movement: pA += 16 bytes, pB += 32 bytes.
.macro KERNEL4x8_M1
	ldr	d2, [pA], #8
	fmov	v0.d[1], x20		// complete v0 with its parked high half
	ldr	d6, [pB], #8
	fmov	v4.d[1], x24		// complete v4
	ldr	d7, [pB, #8]		// low half of the second pB vector; pB is not advanced here
	fmov	v5.d[1], x25		// complete v5

	fmla	v16.4s, v0.4s, v4.s[0]
	ldr	x21, [pA], #8		// high half destined for v2
	fmla	v18.4s, v0.4s, v4.s[1]
	ldr	x26, [pB], #16		// high half destined for v6; also steps over the 8 bytes already in d7
	fmla	v20.4s, v0.4s, v4.s[2]
	ldr	x27, [pB], #8		// high half destined for v7
	fmla	v22.4s, v0.4s, v4.s[3]
	fmla	v24.4s, v0.4s, v5.s[0]
	fmla	v26.4s, v0.4s, v5.s[1]
	fmla	v28.4s, v0.4s, v5.s[2]
	fmla	v30.4s, v0.4s, v5.s[3]
.endm
// KERNEL4x8_M2: middle step of the 4x8 micro-kernel that consumes v2 / v6 / v7.
//
// Mirror image of KERNEL4x8_M1: the high halves parked in x21 / x26 / x27 are
// merged into v2 / v6 / v7, the accumulators v16, v18, ..., v30 are updated
// with fmla, and the operands for the next step are fetched into
// d0 / d4 / d5 (low halves) and x20 / x24 / x25 (high halves).
//
// Net pointer movement: pA += 16 bytes, pB += 32 bytes.
.macro KERNEL4x8_M2
	ldr	d0, [pA], #8
	fmov	v2.d[1], x21		// complete v2 with its parked high half
	ldr	d4, [pB], #8
	fmov	v6.d[1], x26		// complete v6
	ldr	d5, [pB, #8]		// low half of the second pB vector; pB is not advanced here
	fmov	v7.d[1], x27		// complete v7

	fmla	v16.4s, v2.4s, v6.s[0]
	ldr	x20, [pA], #8		// high half destined for v0
	fmla	v18.4s, v2.4s, v6.s[1]
	ldr	x24, [pB], #16		// high half destined for v4; also steps over the 8 bytes already in d5
	fmla	v20.4s, v2.4s, v6.s[2]
	ldr	x25, [pB], #8		// high half destined for v5
	fmla	v22.4s, v2.4s, v6.s[3]
	fmla	v24.4s, v2.4s, v7.s[0]
	fmla	v26.4s, v2.4s, v7.s[1]
	fmla	v28.4s, v2.4s, v7.s[2]
	fmla	v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_E
fmov v2.d[1], x21
fmov v6.d[1], x26
fmov v7.d[1], x27
fmla v16.4s, v2.4s, v6.s[0]
fmla v18.4s, v2.4s, v6.s[1]
fmla v20.4s, v2.4s, v6.s[2]
@ -678,93 +679,90 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
// KERNEL8x4_I: opening step of the 8x4 single-precision micro-kernel.
//
// Loads one 4-lane vector from pB (v8) and two from pA (v0, v1), then writes
// the eight accumulators v16/v17, v20/v21, v24/v25, v28/v29 with fmul (they
// are overwritten, not added to). All four multipliers come from lanes 0-3 of
// v8. While the multiplies issue, the operands for the following step are
// fetched in 64-bit pieces: low halves into d9 / d2 / d3, high halves parked
// in x25 / x22 / x23 for a later fmov (see KERNEL8x4_M2 / KERNEL8x4_E).
//
// Net pointer movement: pB += 32 bytes, pA += 64 bytes.
.macro KERNEL8x4_I
	ld1	{v8.4s}, [pB], #16
	ld1	{v0.4s, v1.4s}, [pA], #32

	ldr	d9, [pB], #8
	ldr	d2, [pA], #8
	ldr	d3, [pA, #8]		// low half of the second pA vector; pA is not advanced here

	fmul	v16.4s, v0.4s, v8.s[0]
	ldr	x25, [pB], #8		// high half destined for v9
	fmul	v17.4s, v1.4s, v8.s[0]
	ldr	x22, [pA], #16		// high half destined for v2; also steps over the 8 bytes already in d3
	fmul	v20.4s, v0.4s, v8.s[1]
	ldr	x23, [pA], #8		// high half destined for v3
	fmul	v21.4s, v1.4s, v8.s[1]
	fmul	v24.4s, v0.4s, v8.s[2]
	fmul	v25.4s, v1.4s, v8.s[2]
	fmul	v28.4s, v0.4s, v8.s[3]
	fmul	v29.4s, v1.4s, v8.s[3]
.endm
// KERNEL8x4_M1: middle step of the 8x4 micro-kernel that consumes v0 / v1 / v8.
//
// On entry the low halves of v8, v0, v1 are already loaded and their high
// halves are held in x24, x20, x21; the fmov instructions complete the
// vectors. The accumulators v16/v17, v20/v21, v24/v25, v28/v29 are updated
// with fmla using lanes 0-3 of v8. Interleaved with the arithmetic, the
// operands for the next step are fetched: low halves into d9 / d2 / d3, high
// halves into x25 / x22 / x23.
//
// Net pointer movement: pB += 16 bytes, pA += 32 bytes.
.macro KERNEL8x4_M1
	ldr	d9, [pB], #8
	fmov	v8.d[1], x24		// complete v8 with its parked high half
	ldr	d2, [pA], #8
	fmov	v0.d[1], x20		// complete v0
	ldr	d3, [pA, #8]		// low half of the second pA vector; pA is not advanced here
	fmov	v1.d[1], x21		// complete v1

	fmla	v16.4s, v0.4s, v8.s[0]
	ldr	x25, [pB], #8		// high half destined for v9
	fmla	v17.4s, v1.4s, v8.s[0]
	ldr	x22, [pA], #16		// high half destined for v2; also steps over the 8 bytes already in d3
	fmla	v20.4s, v0.4s, v8.s[1]
	ldr	x23, [pA], #8		// high half destined for v3
	fmla	v21.4s, v1.4s, v8.s[1]
	fmla	v24.4s, v0.4s, v8.s[2]
	fmla	v25.4s, v1.4s, v8.s[2]
	fmla	v28.4s, v0.4s, v8.s[3]
	fmla	v29.4s, v1.4s, v8.s[3]
.endm
// KERNEL8x4_M2: middle step of the 8x4 micro-kernel that consumes v2 / v3 / v9.
//
// Mirror image of KERNEL8x4_M1: the high halves parked in x25 / x22 / x23 are
// merged into v9 / v2 / v3, the accumulators v16/v17, v20/v21, v24/v25,
// v28/v29 are updated with fmla using lanes 0-3 of v9, and the operands for
// the next step are fetched into d8 / d0 / d1 (low halves) and
// x24 / x20 / x21 (high halves).
//
// Net pointer movement: pB += 16 bytes, pA += 32 bytes.
.macro KERNEL8x4_M2
	ldr	d8, [pB], #8
	fmov	v9.d[1], x25		// complete v9 with its parked high half
	ldr	d0, [pA], #8
	fmov	v2.d[1], x22		// complete v2
	ldr	d1, [pA, #8]		// low half of the second pA vector; pA is not advanced here
	fmov	v3.d[1], x23		// complete v3

	fmla	v16.4s, v2.4s, v9.s[0]
	ldr	x24, [pB], #8		// high half destined for v8
	fmla	v17.4s, v3.4s, v9.s[0]
	ldr	x20, [pA], #16		// high half destined for v0; also steps over the 8 bytes already in d1
	fmla	v20.4s, v2.4s, v9.s[1]
	ldr	x21, [pA], #8		// high half destined for v1
	fmla	v21.4s, v3.4s, v9.s[1]
	fmla	v24.4s, v2.4s, v9.s[2]
	fmla	v25.4s, v3.4s, v9.s[2]
	fmla	v28.4s, v2.4s, v9.s[3]
	fmla	v29.4s, v3.4s, v9.s[3]
.endm
// KERNEL8x4_E: closing step of the 8x4 micro-kernel.
//
// Performs no memory access and leaves pA / pB untouched. It merges the high
// halves parked in x25 / x22 / x23 into v9 / v2 / v3, then applies the last
// fmla update to v16/v17, v20/v21, v24/v25, v28/v29 using lanes 0-3 of v9.
.macro KERNEL8x4_E
	fmov	v9.d[1], x25		// complete v9 with its parked high half
	fmov	v2.d[1], x22		// complete v2
	fmov	v3.d[1], x23		// complete v3

	fmla	v16.4s, v2.4s, v9.s[0]
	fmla	v17.4s, v3.4s, v9.s[0]
	fmla	v20.4s, v2.4s, v9.s[1]
	fmla	v21.4s, v3.4s, v9.s[1]
	fmla	v24.4s, v2.4s, v9.s[2]
	fmla	v25.4s, v3.4s, v9.s[2]
	fmla	v28.4s, v2.4s, v9.s[3]
	fmla	v29.4s, v3.4s, v9.s[3]
.endm
// KERNEL8x4_SUB: stand-alone (non-pipelined) 8x4 update step.
//
// Loads one 4-lane vector from pB (v8) and two from pA (v0, v1) with
// post-incrementing ld1, then accumulates into v16/v17, v20/v21, v24/v25,
// v28/v29 with fmla, one accumulator pair per lane of v8. Nothing is
// preloaded for a following step and no general-purpose scratch registers
// are used.
//
// Net pointer movement: pB += 16 bytes, pA += 32 bytes.
.macro KERNEL8x4_SUB
	ld1	{v8.4s}, [pB], #16
	ld1	{v0.4s, v1.4s}, [pA], #32

	fmla	v16.4s, v0.4s, v8.s[0]
	fmla	v17.4s, v1.4s, v8.s[0]
	fmla	v20.4s, v0.4s, v8.s[1]
	fmla	v21.4s, v1.4s, v8.s[1]
	fmla	v24.4s, v0.4s, v8.s[2]
	fmla	v25.4s, v1.4s, v8.s[2]
	fmla	v28.4s, v0.4s, v8.s[3]
	fmla	v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE8x4