Align general-purpose register usage with strmm_kernel_8x8
This commit is contained in:
parent
0e6eb8c247
commit
edb423d772
|
@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
|
@ -78,14 +77,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
// 17
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 20 must save pA0_2, pA0_3
|
||||
// 21 must save pA0_6, pA0_7
|
||||
// 22 must save pA1_2, pA1_3
|
||||
// 23 must save pA1_6, pA1_7
|
||||
// 24 must save pB0_2, pB0_3
|
||||
// 25 must save pB0_6, pB0_7
|
||||
// 26 must save pB1_2, pB1_3
|
||||
// 27 must save pB1_6, pB1_7
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
|
@ -155,13 +154,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ldr d3, [pA, #8]
|
||||
ldr d7, [pB, #8]
|
||||
|
||||
ldr x20, [pA], #16
|
||||
ldr x22, [pA], #16
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
ldr x24, [pB], #16
|
||||
ldr x26, [pB], #16
|
||||
fmul v17.4s, v1.4s, v4.s[0]
|
||||
ldr x21, [pA], #8
|
||||
ldr x23, [pA], #8
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
ldr x25, [pB], #8
|
||||
ldr x27, [pB], #8
|
||||
fmul v19.4s, v1.4s, v4.s[1]
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v21.4s, v1.4s, v4.s[2]
|
||||
|
@ -179,21 +178,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNEL8x8_M1
|
||||
ldr d2, [pA], #8
|
||||
fmov v0.d[1], x18
|
||||
fmov v0.d[1], x20
|
||||
ldr d6, [pB], #8
|
||||
fmov v4.d[1], x22
|
||||
fmov v4.d[1], x24
|
||||
ldr d3, [pA, #8]
|
||||
fmov v1.d[1], x19
|
||||
fmov v1.d[1], x21
|
||||
ldr d7, [pB, #8]
|
||||
fmov v5.d[1], x23
|
||||
fmov v5.d[1], x25
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
ldr x20, [pA], #16
|
||||
ldr x22, [pA], #16
|
||||
fmla v17.4s, v1.4s, v4.s[0]
|
||||
ldr x24, [pB], #16
|
||||
ldr x26, [pB], #16
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
ldr x21, [pA], #8
|
||||
ldr x23, [pA], #8
|
||||
fmla v19.4s, v1.4s, v4.s[1]
|
||||
ldr x25, [pB], #8
|
||||
ldr x27, [pB], #8
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
fmla v21.4s, v1.4s, v4.s[2]
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
|
@ -210,21 +209,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro KERNEL8x8_M2
|
||||
ldr d0, [pA], #8
|
||||
fmov v2.d[1], x20
|
||||
fmov v2.d[1], x22
|
||||
ldr d4, [pB], #8
|
||||
fmov v6.d[1], x24
|
||||
fmov v6.d[1], x26
|
||||
ldr d1, [pA, #8]
|
||||
fmov v3.d[1], x21
|
||||
fmov v3.d[1], x23
|
||||
ldr d5, [pB, #8]
|
||||
fmov v7.d[1], x25
|
||||
fmov v7.d[1], x27
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
ldr x18, [pA], #16
|
||||
ldr x20, [pA], #16
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
ldr x22, [pB], #16
|
||||
ldr x24, [pB], #16
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
ldr x19, [pA], #8
|
||||
ldr x21, [pA], #8
|
||||
fmla v19.4s, v3.4s, v6.s[1]
|
||||
ldr x23, [pB], #8
|
||||
ldr x25, [pB], #8
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
fmla v21.4s, v3.4s, v6.s[2]
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
|
@ -240,10 +239,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro KERNEL8x8_E
|
||||
fmov v2.d[1], x20
|
||||
fmov v6.d[1], x24
|
||||
fmov v3.d[1], x21
|
||||
fmov v7.d[1], x25
|
||||
fmov v2.d[1], x22
|
||||
fmov v6.d[1], x26
|
||||
fmov v3.d[1], x23
|
||||
fmov v7.d[1], x27
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v17.4s, v3.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
|
@ -363,67 +362,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL4x8_I: every accumulator touched here (v16, v18, v20, v22, v24, v26,
// v28, v30) is written with fmul, i.e. v0 multiplied by one lane of v4 or v5,
// instead of being accumulated into with fmla.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. The "ld1 + add" loads and the post-indexed ld1/ldr loads target the
// same registers, so presumably one set is the removed code and the other its
// replacement -- confirm against the real file before assembling.
.macro KERNEL4x8_I
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v5.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
// Post-indexed forms: load v0 from pA and v4/v5 from pB, advancing the
// pointers by 16 and 32 bytes in the same instruction.
ld1 {v0.4s}, [pA], #16
|
||||
ld1 {v4.4s, v5.4s}, [pB], #32
|
||||
|
||||
// d2/d6/d7 receive the low 64 bits of v2/v6/v7. x21/x26/x27 are loaded as
// general registers; KERNEL4x8_M2 and KERNEL4x8_E fmov them into
// v2.d[1]/v6.d[1]/v7.d[1].
ldr d2, [pA], #8
|
||||
ldr d6, [pB], #8
|
||||
ldr d7, [pB, #8]
|
||||
ldr x21, [pA], #8
|
||||
fmul v16.4s, v0.4s, v4.s[0]
|
||||
ldr x26, [pB], #16
|
||||
fmul v18.4s, v0.4s, v4.s[1]
|
||||
ldr x27, [pB], #8
|
||||
fmul v20.4s, v0.4s, v4.s[2]
|
||||
fmul v22.4s, v0.4s, v4.s[3]
|
||||
fmul v24.4s, v0.4s, v5.s[0]
|
||||
fmul v26.4s, v0.4s, v5.s[1]
|
||||
fmul v28.4s, v0.4s, v5.s[2]
|
||||
fmul v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
// Full-width loads of v6, v7 and v2 (same registers as the ldr d/x loads
// above -- see the NOTE(review) at the top of this macro).
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v7.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v2.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
.endm
|
||||
|
||||
// KERNEL4x8_M1: accumulates v0 times each lane of v4/v5 into v16, v18, ...,
// v30 with fmla, while loading v2/v6/v7 (or their halves) from pA/pB.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. The ldr d/x + fmov sequence and the trailing "ld1 + add" loads both
// fill v2/v6/v7, so presumably one of them is removed code -- confirm against
// the real file.
.macro KERNEL4x8_M1
|
||||
ldr d2, [pA], #8
|
||||
// fmov writes x20/x24/x25 into the upper 64 bits of v0/v4/v5; those general
// registers are loaded by KERNEL4x8_M2.
fmov v0.d[1], x20
|
||||
ldr d6, [pB], #8
|
||||
fmov v4.d[1], x24
|
||||
ldr d7, [pB, #8]
|
||||
fmov v5.d[1], x25
|
||||
fmla v16.4s, v0.4s, v4.s[0]
|
||||
// x21/x26/x27 hold the upper halves of v2/v6/v7 until KERNEL4x8_M2 or
// KERNEL4x8_E moves them in with fmov.
ldr x21, [pA], #8
|
||||
fmla v18.4s, v0.4s, v4.s[1]
|
||||
ldr x26, [pB], #16
|
||||
fmla v20.4s, v0.4s, v4.s[2]
|
||||
ldr x27, [pB], #8
|
||||
fmla v22.4s, v0.4s, v4.s[3]
|
||||
fmla v24.4s, v0.4s, v5.s[0]
|
||||
fmla v26.4s, v0.4s, v5.s[1]
|
||||
fmla v28.4s, v0.4s, v5.s[2]
|
||||
fmla v30.4s, v0.4s, v5.s[3]
|
||||
|
||||
ld1 {v6.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v7.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v2.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
.endm
|
||||
|
||||
// KERNEL4x8_M2: counterpart of KERNEL4x8_M1 with the register roles swapped.
// It accumulates v2 times each lane of v6/v7 into v16, v18, ..., v30 while
// loading v0/v4/v5 (or their halves) from pA/pB.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. The ldr d/x + fmov sequence and the trailing "ld1 + add" loads both
// fill v0/v4/v5, so presumably one of them is removed code -- confirm against
// the real file.
.macro KERNEL4x8_M2
|
||||
ldr d0, [pA], #8
|
||||
// fmov writes x21/x26/x27 (loaded by KERNEL4x8_I / KERNEL4x8_M1) into the
// upper 64 bits of v2/v6/v7 before the fmla instructions read them.
fmov v2.d[1], x21
|
||||
ldr d4, [pB], #8
|
||||
fmov v6.d[1], x26
|
||||
ldr d5, [pB, #8]
|
||||
fmov v7.d[1], x27
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
// x20/x24/x25 are consumed by the fmov instructions in KERNEL4x8_M1.
ldr x20, [pA], #8
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
ldr x24, [pB], #16
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
ldr x25, [pB], #8
|
||||
fmla v22.4s, v2.4s, v6.s[3]
|
||||
fmla v24.4s, v2.4s, v7.s[0]
|
||||
fmla v26.4s, v2.4s, v7.s[1]
|
||||
fmla v28.4s, v2.4s, v7.s[2]
|
||||
fmla v30.4s, v2.4s, v7.s[3]
|
||||
|
||||
ld1 {v4.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v5.4s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x8_E
|
||||
fmov v2.d[1], x21
|
||||
fmov v6.d[1], x26
|
||||
fmov v7.d[1], x27
|
||||
fmla v16.4s, v2.4s, v6.s[0]
|
||||
fmla v18.4s, v2.4s, v6.s[1]
|
||||
fmla v20.4s, v2.4s, v6.s[2]
|
||||
|
@ -678,93 +679,90 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
// KERNEL8x4_I: writes accumulators v16, v17, v20, v21, v24, v25, v28, v29
// with fmul (v0/v1 times one lane of the pB data) instead of accumulating.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. Two variants are interleaved: one keeps the pB values in v8.2s/v9.2s
// (lanes v8.s[0..1], v9.s[0..1]) and preloads v12/v13/v4/v5 with "ld1 + add";
// the other keeps them in v8.4s (lanes v8.s[0..3]) and preloads d9/d2/d3 plus
// x25/x22/x23. v24/v25/v28/v29 are therefore written twice as shown.
// Presumably only one variant is live -- confirm against the real file.
.macro KERNEL8x4_I
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
// Post-indexed forms of the same loads (pB += 16, pA += 32).
ld1 {v8.4s}, [pB], #16
|
||||
ld1 {v0.4s, v1.4s}, [pA], #32
|
||||
|
||||
// d9/d2/d3 receive the low 64 bits of v9/v2/v3. x25/x22/x23 are loaded as
// general registers; KERNEL8x4_M2 and KERNEL8x4_E fmov them into
// v9.d[1]/v2.d[1]/v3.d[1].
ldr d9, [pB], #8
|
||||
ldr d2, [pA], #8
|
||||
ldr d3, [pA, #8]
|
||||
fmul v16.4s, v0.4s, v8.s[0]
|
||||
ldr x25, [pB], #8
|
||||
fmul v17.4s, v1.4s, v8.s[0]
|
||||
ldr x22, [pA], #16
|
||||
fmul v20.4s, v0.4s, v8.s[1]
|
||||
ldr x23, [pA], #8
|
||||
fmul v21.4s, v1.4s, v8.s[1]
|
||||
fmul v24.4s, v0.4s, v9.s[0]
|
||||
fmul v25.4s, v1.4s, v9.s[0]
|
||||
fmul v28.4s, v0.4s, v9.s[1]
|
||||
fmul v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v4.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ld1 {v5.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
fmul v24.4s, v0.4s, v8.s[2]
|
||||
fmul v25.4s, v1.4s, v8.s[2]
|
||||
fmul v28.4s, v0.4s, v8.s[3]
|
||||
fmul v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
// KERNEL8x4_M1: accumulates v0/v1 times lanes of the pB data into v16, v17,
// v20, v21, v24, v25, v28, v29 with fmla, while loading operands from pA/pB.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. Two variants are interleaved: one reads lanes v8.s[0..1]/v9.s[0..1]
// and preloads v12/v13/v4/v5 with "ld1 + add"; the other reads v8.s[0..3]
// and preloads d9/d2/d3 plus x25/x22/x23. v24/v25/v28/v29 are therefore
// updated twice as shown. Presumably only one variant is live -- confirm
// against the real file.
.macro KERNEL8x4_M1
|
||||
ldr d9, [pB], #8
|
||||
// fmov writes x24/x20/x21 (loaded by KERNEL8x4_M2) into the upper 64 bits of
// v8/v0/v1 before the fmla instructions read them.
fmov v8.d[1], x24
|
||||
ldr d2, [pA], #8
|
||||
fmov v0.d[1], x20
|
||||
ldr d3, [pA, #8]
|
||||
fmov v1.d[1], x21
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
// x25/x22/x23 are consumed by the fmov instructions in KERNEL8x4_M2 and
// KERNEL8x4_E.
ldr x25, [pB], #8
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
ldr x22, [pA], #16
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
ldr x23, [pA], #8
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
|
||||
ld1 {v12.2s, v13.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v4.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ld1 {v5.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v25.4s, v1.4s, v8.s[2]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
fmla v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
// KERNEL8x4_M2: counterpart of KERNEL8x4_M1. It accumulates into v16, v17,
// v20, v21, v24, v25, v28, v29 and loads the operands that KERNEL8x4_M1 reads
// (v8 and v0/v1, or their halves).
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. Two complete variants appear back to back: the first multiplies
// v4/v5 by lanes of v12/v13 and reloads v8/v9/v0/v1 with "ld1 + add"; the
// second multiplies v2/v3 by lanes v9.s[0..3] and reloads d8/d0/d1 plus
// x24/x20/x21. Presumably only one variant is live -- confirm against the
// real file.
.macro KERNEL8x4_M2
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ldr d8, [pB], #8
|
||||
// fmov writes x25/x22/x23 (loaded by KERNEL8x4_I / KERNEL8x4_M1) into the
// upper 64 bits of v9/v2/v3 before the fmla instructions read them.
fmov v9.d[1], x25
|
||||
ldr d0, [pA], #8
|
||||
fmov v2.d[1], x22
|
||||
ldr d1, [pA, #8]
|
||||
fmov v3.d[1], x23
|
||||
fmla v16.4s, v2.4s, v9.s[0]
|
||||
// x24/x20/x21 are consumed by the fmov instructions in KERNEL8x4_M1.
ldr x24, [pB], #8
|
||||
fmla v17.4s, v3.4s, v9.s[0]
|
||||
ldr x20, [pA], #16
|
||||
fmla v20.4s, v2.4s, v9.s[1]
|
||||
ldr x21, [pA], #8
|
||||
fmla v21.4s, v3.4s, v9.s[1]
|
||||
fmla v24.4s, v2.4s, v9.s[2]
|
||||
fmla v25.4s, v3.4s, v9.s[2]
|
||||
fmla v28.4s, v2.4s, v9.s[3]
|
||||
fmla v29.4s, v3.4s, v9.s[3]
|
||||
.endm
|
||||
|
||||
// KERNEL8x4_E: accumulates into v16, v17, v20, v21, v24, v25, v28, v29
// without issuing any loads from pA or pB.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. Two complete variants appear back to back: the first multiplies
// v4/v5 by lanes of v12/v13; the second first moves x25/x22/x23 into the
// upper halves of v9/v2/v3 and then multiplies v2/v3 by lanes v9.s[0..3].
// Presumably only one variant is live -- confirm against the real file.
.macro KERNEL8x4_E
|
||||
fmla v16.4s, v4.4s, v12.s[0]
|
||||
fmla v17.4s, v5.4s, v12.s[0]
|
||||
fmla v20.4s, v4.4s, v12.s[1]
|
||||
fmla v21.4s, v5.4s, v12.s[1]
|
||||
fmla v24.4s, v4.4s, v13.s[0]
|
||||
fmla v25.4s, v5.4s, v13.s[0]
|
||||
fmla v28.4s, v4.4s, v13.s[1]
|
||||
fmla v29.4s, v5.4s, v13.s[1]
|
||||
// x25/x22/x23 are loaded by KERNEL8x4_I / KERNEL8x4_M1.
fmov v9.d[1], x25
|
||||
fmov v2.d[1], x22
|
||||
fmov v3.d[1], x23
|
||||
fmla v16.4s, v2.4s, v9.s[0]
|
||||
fmla v17.4s, v3.4s, v9.s[0]
|
||||
fmla v20.4s, v2.4s, v9.s[1]
|
||||
fmla v21.4s, v3.4s, v9.s[1]
|
||||
fmla v24.4s, v2.4s, v9.s[2]
|
||||
fmla v25.4s, v3.4s, v9.s[2]
|
||||
fmla v28.4s, v2.4s, v9.s[3]
|
||||
fmla v29.4s, v3.4s, v9.s[3]
|
||||
.endm
|
||||
|
||||
// KERNEL8x4_SUB: self-contained step. It loads v0/v1 from pA and the pB
// values, advances both pointers, and accumulates into v16, v17, v20, v21,
// v24, v25, v28, v29 with fmla. No general registers are used for staging.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. The "ld1 + add" loads (pB data in v8.2s/v9.2s) and the post-indexed
// ld1 loads (pB data in v8.4s) are both present, as are fmla lines indexed by
// v9.s[0..1] and by v8.s[2..3] for the same accumulators. Presumably only one
// variant is live -- confirm against the real file.
.macro KERNEL8x4_SUB
|
||||
ld1 {v8.2s, v9.2s}, [pB]
|
||||
add pB, pB, #16
|
||||
ld1 {v0.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
ld1 {v1.4s}, [pA]
|
||||
add pA, pA, #16
|
||||
|
||||
// Post-indexed forms of the same loads (pB += 16, pA += 32).
ld1 {v8.4s}, [pB], #16
|
||||
ld1 {v0.4s, v1.4s}, [pA], #32
|
||||
fmla v16.4s, v0.4s, v8.s[0]
|
||||
fmla v17.4s, v1.4s, v8.s[0]
|
||||
fmla v20.4s, v0.4s, v8.s[1]
|
||||
fmla v21.4s, v1.4s, v8.s[1]
|
||||
fmla v24.4s, v0.4s, v9.s[0]
|
||||
fmla v25.4s, v1.4s, v9.s[0]
|
||||
fmla v28.4s, v0.4s, v9.s[1]
|
||||
fmla v29.4s, v1.4s, v9.s[1]
|
||||
fmla v24.4s, v0.4s, v8.s[2]
|
||||
fmla v25.4s, v1.4s, v8.s[2]
|
||||
fmla v28.4s, v0.4s, v8.s[3]
|
||||
fmla v29.4s, v1.4s, v8.s[3]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
|
|
Loading…
Reference in New Issue