Merge pull request #3845 from Mousius/asimd-dot-opt
Remove unnecessary instructions from Advanced SIMD dot
This commit is contained in:
		
						commit
						b6a4ef98b9
					
				|  | @ -1,5 +1,6 @@ | ||||||
| /***************************************************************************
 | /***************************************************************************
 | ||||||
| Copyright (c) 2017, The OpenBLAS Project | Copyright (c) 2017, The OpenBLAS Project | ||||||
|  | Copyright (c) 2022, Arm Ltd | ||||||
| All rights reserved. | All rights reserved. | ||||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||||
| modification, are permitted provided that the following conditions are | modification, are permitted provided that the following conditions are | ||||||
|  | @ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| #define RETURN_TYPE	double | #define RETURN_TYPE	double | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #define N		"x0"	/* vector length */ |  | ||||||
| #define X		"x1"	/* "X" vector address */ |  | ||||||
| #define INC_X		"x2"	/* "X" stride */ |  | ||||||
| #define Y		"x3"	/* "Y" vector address */ |  | ||||||
| #define INC_Y		"x4"	/* "Y" stride */ |  | ||||||
| #define J		"x5"	/* loop variable */ |  | ||||||
| 
 |  | ||||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||||
| #if !defined(DSDOT) | #if !defined(DSDOT) | ||||||
|  | #define DOT_MOD	"s" | ||||||
| #define REG0		"wzr" | #define REG0		"wzr" | ||||||
| #define DOTF		"s0" |  | ||||||
| #define TMPX		"s16" | #define TMPX		"s16" | ||||||
| #define TMPY		"s24" | #define TMPY		"s24" | ||||||
| #define INC_SHIFT	"2" | #define INC_SHIFT	"2" | ||||||
| #define N_DIV_SHIFT	"6" | #define N_DIV_SHIFT	"6" | ||||||
| #define N_REM_MASK	"63" | #define N_REM_MASK	"63" | ||||||
| #else | #else | ||||||
|  | #define DOT_MOD	"d" | ||||||
| #define REG0		"xzr" | #define REG0		"xzr" | ||||||
| #define DOTF		"d0" |  | ||||||
| #define TMPX		"s16" | #define TMPX		"s16" | ||||||
| #define TMPX1		"d2" | #define TMPX1		"d2" | ||||||
| #define TMPY		"s24" | #define TMPY		"s24" | ||||||
|  | @ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| #define N_REM_MASK	"15" | #define N_REM_MASK	"15" | ||||||
| #endif | #endif | ||||||
| #else | #else | ||||||
|  | #define DOT_MOD	"d" | ||||||
| #define REG0		"xzr" | #define REG0		"xzr" | ||||||
| #define DOTF		"d0" |  | ||||||
| #define TMPX		"d16" | #define TMPX		"d16" | ||||||
| #define TMPY		"d24" | #define TMPY		"d24" | ||||||
| #define INC_SHIFT	"3" | #define INC_SHIFT	"3" | ||||||
|  | @ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| #define N_REM_MASK	"31" | #define N_REM_MASK	"31" | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #define OUT		"%"DOT_MOD"[DOT_]" | ||||||
|  | 
 | ||||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||||
| 
 | 
 | ||||||
| #if !defined(DSDOT) | #if !defined(DSDOT) | ||||||
| #define KERNEL_F1						\ | #define KERNEL_F1						\ | ||||||
| 	"	ldr	"TMPX", ["X"]			\n"	\ | 	"	ldr	"TMPX", [%[X_]]			\n"	\ | ||||||
| 	"	ldr	"TMPY", ["Y"]			\n"	\ | 	"	ldr	"TMPY", [%[Y_]]			\n"	\ | ||||||
| 	"	add	"X", "X", "INC_X"		\n"	\ | 	"	add	%[X_], %[X_], %[INCX_]		\n"	\ | ||||||
| 	"	add	"Y", "Y", "INC_Y"		\n"	\ | 	"	add	%[Y_], %[Y_], %[INCY_]		\n"	\ | ||||||
| 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"  \n" | 	"	fmadd	"OUT", "TMPX", "TMPY", "OUT"    \n" | ||||||
| 
 | 
 | ||||||
| #define KERNEL_F						\ | #define KERNEL_F						\ | ||||||
| 	"	ldp	q16, q17, ["X"]			\n"	\ | 	"	ldp	q16, q17, [%[X_]]		\n"	\ | ||||||
| 	"	ldp	q24, q25, ["Y"]			\n"	\ | 	"	ldp	q24, q25, [%[Y_]]		\n"	\ | ||||||
| 	"	ldp	q18, q19, ["X", #32]		\n"	\ | 	"	ldp	q18, q19, [%[X_], #32]		\n"	\ | ||||||
| 	"	ldp	q26, q27, ["Y", #32]		\n"	\ | 	"	ldp	q26, q27, [%[Y_], #32]		\n"	\ | ||||||
| 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\ | 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\ | ||||||
| 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\ | 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\ | ||||||
| 	"	ldp	q20, q21, ["X", #64]		\n"	\ | 	"	ldp	q20, q21, [%[X_], #64]		\n"	\ | ||||||
| 	"	ldp	q28, q29, ["Y", #64]		\n"	\ | 	"	ldp	q28, q29, [%[Y_], #64]		\n"	\ | ||||||
| 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\ | 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\ | ||||||
| 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\ | 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\ | ||||||
| 	"	ldp	q22, q23, ["X", #96]		\n"	\ | 	"	ldp	q22, q23, [%[X_], #96]		\n"	\ | ||||||
| 	"	ldp	q30, q31, ["Y", #96]		\n"	\ | 	"	ldp	q30, q31, [%[Y_], #96]		\n"	\ | ||||||
| 	"	add	"Y", "Y", #128			\n"	\ | 	"	add	%[Y_], %[Y_], #128		\n"	\ | ||||||
| 	"	add	"X", "X", #128			\n"	\ | 	"	add	%[X_], %[X_], #128		\n"	\ | ||||||
| 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\ | 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\ | ||||||
| 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\ | 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896+64]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896+64]	\n"	\ | ||||||
| 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\ | 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\ | ||||||
| 	"	fmla	v7.4s, v23.4s, v31.4s		\n"	\ | 	"	fmla	v7.4s, v23.4s, v31.4s		\n"	\ | ||||||
| 	"	ldp	q16, q17, ["X"]			\n"	\ | 	"	ldp	q16, q17, [%[X_]]		\n"	\ | ||||||
| 	"	ldp	q24, q25, ["Y"]			\n"	\ | 	"	ldp	q24, q25, [%[Y_]]		\n"	\ | ||||||
| 	"	ldp	q18, q19, ["X", #32]		\n"	\ | 	"	ldp	q18, q19, [%[X_], #32]		\n"	\ | ||||||
| 	"	ldp	q26, q27, ["Y", #32]		\n"	\ | 	"	ldp	q26, q27, [%[Y_], #32]		\n"	\ | ||||||
| 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\ | 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\ | ||||||
| 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\ | 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\ | ||||||
| 	"	ldp	q20, q21, ["X", #64]		\n"	\ | 	"	ldp	q20, q21, [%[X_], #64]		\n"	\ | ||||||
| 	"	ldp	q28, q29, ["Y", #64]		\n"	\ | 	"	ldp	q28, q29, [%[Y_], #64]		\n"	\ | ||||||
| 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\ | 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\ | ||||||
| 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\ | 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\ | ||||||
| 	"	ldp	q22, q23, ["X", #96]		\n"	\ | 	"	ldp	q22, q23, [%[X_], #96]		\n"	\ | ||||||
| 	"	ldp	q30, q31, ["Y", #96]		\n"	\ | 	"	ldp	q30, q31, [%[Y_], #96]		\n"	\ | ||||||
| 	"	add	"Y", "Y", #128			\n"	\ | 	"	add	%[Y_], %[Y_], #128		\n"	\ | ||||||
| 	"	add	"X", "X", #128			\n"	\ | 	"	add	%[X_], %[X_], #128		\n"	\ | ||||||
| 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\ | 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\ | ||||||
| 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\ | 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896+64]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896+64]	\n"	\ | ||||||
| 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\ | 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\ | ||||||
| 	"	fmla	v7.4s, v23.4s, v31.4s		\n" | 	"	fmla	v7.4s, v23.4s, v31.4s		\n" | ||||||
| 
 | 
 | ||||||
|  | @ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| 
 | 
 | ||||||
| #else /* !defined(DSDOT) */ | #else /* !defined(DSDOT) */ | ||||||
| #define KERNEL_F1						\ | #define KERNEL_F1						\ | ||||||
| 	"	ldr	"TMPX", ["X"]			\n"	\ | 	"	ldr	"TMPX", [%[X_]]			\n"	\ | ||||||
| 	"	ldr	"TMPY", ["Y"]			\n"	\ | 	"	ldr	"TMPY", [%[Y_]]			\n"	\ | ||||||
| 	"	add	"X", "X", "INC_X"		\n"	\ | 	"	add	%[X_], %[X_], %[INCX_]		\n"	\ | ||||||
| 	"	add	"Y", "Y", "INC_Y"		\n"	\ | 	"	add	%[Y_], %[Y_], %[INCY_]		\n"	\ | ||||||
| 	"	fcvt	"TMPX1", "TMPX"			\n"	\ | 	"	fcvt	"TMPX1", "TMPX"			\n"	\ | ||||||
| 	"	fcvt	"TMPY1", "TMPY"			\n"	\ | 	"	fcvt	"TMPY1", "TMPY"			\n"	\ | ||||||
| 	"	fmul	"TMPX1", "TMPX1", "TMPY1"	\n"	\ | 	"	fmul	"TMPX1", "TMPX1", "TMPY1"	\n"	\ | ||||||
| 	"	fadd	"DOTF", "DOTF", "TMPX1"		\n" | 	"	fadd	"OUT", "OUT", "TMPX1"	 	\n" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| #define KERNEL_F						\ | #define KERNEL_F						\ | ||||||
| 	"	ldp	q18, q19, ["X"]			\n"	\ | 	"	ldp	q18, q19, [%[X_]]		\n"	\ | ||||||
| 	"	ldp	q26, q27, ["Y"]			\n"	\ | 	"	ldp	q26, q27, [%[Y_]]		\n"	\ | ||||||
| 	"	fcvtl	v16.2d, v18.2s			\n"	\ | 	"	fcvtl	v16.2d, v18.2s			\n"	\ | ||||||
| 	"	fcvtl2	v17.2d, v18.4s			\n"	\ | 	"	fcvtl2	v17.2d, v18.4s			\n"	\ | ||||||
| 	"	fcvtl	v18.2d, v19.2s			\n"	\ | 	"	fcvtl	v18.2d, v19.2s			\n"	\ | ||||||
|  | @ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| 	"	fcvtl2	v25.2d, v26.4s			\n"	\ | 	"	fcvtl2	v25.2d, v26.4s			\n"	\ | ||||||
| 	"	fcvtl	v26.2d, v27.2s			\n"	\ | 	"	fcvtl	v26.2d, v27.2s			\n"	\ | ||||||
| 	"	fcvtl2	v27.2d, v27.4s			\n"	\ | 	"	fcvtl2	v27.2d, v27.4s			\n"	\ | ||||||
| 	"	ldp	q22, q23, ["X", #32]		\n"	\ | 	"	ldp	q22, q23, [%[X_], #32]		\n"	\ | ||||||
| 	"	ldp	q30, q31, ["Y", #32]		\n"	\ | 	"	ldp	q30, q31, [%[Y_], #32]		\n"	\ | ||||||
| 	"	fcvtl	v20.2d, v22.2s			\n"	\ | 	"	fcvtl	v20.2d, v22.2s			\n"	\ | ||||||
| 	"	fcvtl2	v21.2d, v22.4s			\n"	\ | 	"	fcvtl2	v21.2d, v22.4s			\n"	\ | ||||||
| 	"	fcvtl	v22.2d, v23.2s			\n"	\ | 	"	fcvtl	v22.2d, v23.2s			\n"	\ | ||||||
|  | @ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| 	"	fcvtl2	v29.2d, v30.4s			\n"	\ | 	"	fcvtl2	v29.2d, v30.4s			\n"	\ | ||||||
| 	"	fcvtl	v30.2d, v31.2s			\n"	\ | 	"	fcvtl	v30.2d, v31.2s			\n"	\ | ||||||
| 	"	fcvtl2	v31.2d, v31.4s			\n"	\ | 	"	fcvtl2	v31.2d, v31.4s			\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896+64]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896+64]	\n"	\ | ||||||
| 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | ||||||
| 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | ||||||
| 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | ||||||
| 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | ||||||
| 	"	add	"Y", "Y", #64			\n"	\ | 	"	add	%[Y_], %[Y_], #64		\n"	\ | ||||||
| 	"	add	"X", "X", #64			\n"	\ | 	"	add	%[X_], %[X_], #64		\n"	\ | ||||||
| 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | ||||||
| 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | ||||||
| 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | ||||||
|  | @ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\ | 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\ | ||||||
| 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\ | 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\ | ||||||
| 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\ | 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\ | ||||||
| 	"	faddp	"DOTF", v0.2d			\n" | 	"	faddp	"OUT", v0.2d			\n" | ||||||
| #endif /* !defined(DSDOT) */ | #endif /* !defined(DSDOT) */ | ||||||
| 
 | 
 | ||||||
| #else /* !defined(DOUBLE) */ | #else /* !defined(DOUBLE) */ | ||||||
| #define KERNEL_F1						\ | #define KERNEL_F1						\ | ||||||
| 	"	ldr	"TMPX", ["X"]			\n"	\ | 	"	ldr	"TMPX", [%[X_]]			\n"	\ | ||||||
| 	"	ldr	"TMPY", ["Y"]			\n"	\ | 	"	ldr	"TMPY", [%[Y_]]			\n"	\ | ||||||
| 	"	add	"X", "X", "INC_X"		\n"	\ | 	"	add	%[X_], %[X_], %[INCX_]		\n"	\ | ||||||
| 	"	add	"Y", "Y", "INC_Y"		\n"	\ | 	"	add	%[Y_], %[Y_], %[INCY_]		\n"	\ | ||||||
| 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"  \n" | 	"	fmadd	"OUT", "TMPX", "TMPY", "OUT"    \n" | ||||||
| 
 | 
 | ||||||
| #define KERNEL_F						\ | #define KERNEL_F						\ | ||||||
| 	"	ldp	q16, q17, ["X"]			\n"	\ | 	"	ldp	q16, q17, [%[X_]]		\n"	\ | ||||||
| 	"	ldp	q24, q25, ["Y"]			\n"	\ | 	"	ldp	q24, q25, [%[Y_]]		\n"	\ | ||||||
| 	"	ldp	q18, q19, ["X", #32]		\n"	\ | 	"	ldp	q18, q19, [%[X_], #32]		\n"	\ | ||||||
| 	"	ldp	q26, q27, ["Y", #32]		\n"	\ | 	"	ldp	q26, q27, [%[Y_], #32]		\n"	\ | ||||||
| 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | ||||||
| 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | ||||||
| 	"	ldp	q20, q21, ["X", #64]		\n"	\ | 	"	ldp	q20, q21, [%[X_], #64]		\n"	\ | ||||||
| 	"	ldp	q28, q29, ["Y", #64]		\n"	\ | 	"	ldp	q28, q29, [%[Y_], #64]		\n"	\ | ||||||
| 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | ||||||
| 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | ||||||
| 	"	ldp	q22, q23, ["X", #96]		\n"	\ | 	"	ldp	q22, q23, [%[X_], #96]		\n"	\ | ||||||
| 	"	ldp	q30, q31, ["Y", #96]		\n"	\ | 	"	ldp	q30, q31, [%[Y_], #96]		\n"	\ | ||||||
| 	"	add	"Y", "Y", #128			\n"	\ | 	"	add	%[Y_], %[Y_], #128		\n"	\ | ||||||
| 	"	add	"X", "X", #128			\n"	\ | 	"	add	%[X_], %[X_], #128		\n"	\ | ||||||
| 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | ||||||
| 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896+64]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896+64]	\n"	\ | ||||||
| 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | ||||||
| 	"	fmla	v7.2d, v23.2d, v31.2d		\n"	\ | 	"	fmla	v7.2d, v23.2d, v31.2d		\n"	\ | ||||||
| 	"	ldp	q16, q17, ["X"]			\n"	\ | 	"	ldp	q16, q17, [%[X_]]		\n"	\ | ||||||
| 	"	ldp	q24, q25, ["Y"]			\n"	\ | 	"	ldp	q24, q25, [%[Y_]]		\n"	\ | ||||||
| 	"	ldp	q18, q19, ["X", #32]		\n"	\ | 	"	ldp	q18, q19, [%[X_], #32]		\n"	\ | ||||||
| 	"	ldp	q26, q27, ["Y", #32]		\n"	\ | 	"	ldp	q26, q27, [%[Y_], #32]		\n"	\ | ||||||
| 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\ | ||||||
| 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\ | ||||||
| 	"	ldp	q20, q21, ["X", #64]		\n"	\ | 	"	ldp	q20, q21, [%[X_], #64]		\n"	\ | ||||||
| 	"	ldp	q28, q29, ["Y", #64]		\n"	\ | 	"	ldp	q28, q29, [%[Y_], #64]		\n"	\ | ||||||
| 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\ | ||||||
| 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\ | ||||||
| 	"	ldp	q22, q23, ["X", #96]		\n"	\ | 	"	ldp	q22, q23, [%[X_], #96]		\n"	\ | ||||||
| 	"	ldp	q30, q31, ["Y", #96]		\n"	\ | 	"	ldp	q30, q31, [%[Y_], #96]		\n"	\ | ||||||
| 	"	add	"Y", "Y", #128			\n"	\ | 	"	add	%[Y_], %[Y_], #128		\n"	\ | ||||||
| 	"	add	"X", "X", #128			\n"	\ | 	"	add	%[X_], %[X_], #128		\n"	\ | ||||||
| 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\ | ||||||
| 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[X_], #896+64]	\n"	\ | ||||||
| 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\ | 	"	PRFM	PLDL1KEEP, [%[Y_], #896+64]	\n"	\ | ||||||
| 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\ | ||||||
| 	"	fmla	v7.2d, v23.2d, v31.2d		\n" | 	"	fmla	v7.2d, v23.2d, v31.2d		\n" | ||||||
| 
 | 
 | ||||||
|  | @ -261,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||||
| 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\ | 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\ | ||||||
| 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\ | 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\ | ||||||
| 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\ | 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\ | ||||||
| 	"	faddp	"DOTF", v0.2d			\n" | 	"	faddp	"OUT", v0.2d			\n" | ||||||
| #endif /* !defined(DOUBLE) */ | #endif /* !defined(DOUBLE) */ | ||||||
| 
 | 
 | ||||||
| #if defined(SMP) | #if defined(SMP) | ||||||
|  | @ -276,13 +272,10 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | ||||||
| 
 | 
 | ||||||
| 	if ( n < 0 ) return dot; | 	if ( n < 0 ) return dot; | ||||||
| 
 | 
 | ||||||
|  | 	BLASLONG j = 0; | ||||||
|  | 
 | ||||||
| 	__asm__ __volatile__ ( | 	__asm__ __volatile__ ( | ||||||
| 	"	mov	"N", %[N_]			\n" | 	"	fmov	"OUT", "REG0"			\n" | ||||||
| 	"	mov	"X", %[X_]			\n" |  | ||||||
| 	"	mov	"INC_X", %[INCX_]		\n" |  | ||||||
| 	"	mov	"Y", %[Y_]			\n" |  | ||||||
| 	"	mov	"INC_Y", %[INCY_]		\n" |  | ||||||
| 	"	fmov	"DOTF", "REG0"			\n" |  | ||||||
| 	"	fmov	d1, xzr				\n" | 	"	fmov	d1, xzr				\n" | ||||||
| 	"	fmov	d2, xzr				\n" | 	"	fmov	d2, xzr				\n" | ||||||
| 	"	fmov	d3, xzr				\n" | 	"	fmov	d3, xzr				\n" | ||||||
|  | @ -290,42 +283,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | ||||||
| 	"	fmov	d5, xzr				\n" | 	"	fmov	d5, xzr				\n" | ||||||
| 	"	fmov	d6, xzr				\n" | 	"	fmov	d6, xzr				\n" | ||||||
| 	"	fmov	d7, xzr				\n" | 	"	fmov	d7, xzr				\n" | ||||||
| 	"	cmp	"N", xzr			\n" | 	"	cmp	%[INCX_], #1			\n" | ||||||
| 	"	ble	9f //dot_kernel_L999		\n" |  | ||||||
| 	"	cmp	"INC_X", #1			\n" |  | ||||||
| 	"	bne	5f //dot_kernel_S_BEGIN		\n" | 	"	bne	5f //dot_kernel_S_BEGIN		\n" | ||||||
| 	"	cmp	"INC_Y", #1			\n" | 	"	cmp	%[INCY_], #1			\n" | ||||||
| 	"	bne	5f //dot_kernel_S_BEGIN		\n" | 	"	bne	5f //dot_kernel_S_BEGIN		\n" | ||||||
| 
 | 
 | ||||||
| 	"1: //dot_kernel_F_BEGIN:			\n" | 	"1: //dot_kernel_F_BEGIN:			\n" | ||||||
| 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n" | 	"	lsl	%[INCX_], %[INCX_], "INC_SHIFT" \n" | ||||||
| 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n" | 	"	lsl	%[INCY_], %[INCY_], "INC_SHIFT" \n" | ||||||
| 	"	asr	"J", "N", #"N_DIV_SHIFT"	\n" | 	"	asr	%[J_], %[N_], #"N_DIV_SHIFT"	\n" | ||||||
| 	"	cmp	"J", xzr			\n" | 	"	cmp	%[J_], xzr			\n" | ||||||
| 	"	beq	3f //dot_kernel_F1		\n" | 	"	beq	3f //dot_kernel_F1		\n" | ||||||
| 
 | 
 | ||||||
| 	"	.align 5				\n" | 	"	.align 5				\n" | ||||||
| 	"2: //dot_kernel_F:				\n" | 	"2: //dot_kernel_F:				\n" | ||||||
| 	"	"KERNEL_F"				\n" | 	"	"KERNEL_F"				\n" | ||||||
| 	"	subs	"J", "J", #1			\n" | 	"	subs	%[J_], %[J_], #1		\n" | ||||||
| 	"	bne	2b //dot_kernel_F		\n" | 	"	bne	2b //dot_kernel_F		\n" | ||||||
| 	"	"KERNEL_F_FINALIZE"			\n" | 	"	"KERNEL_F_FINALIZE"			\n" | ||||||
| 
 | 
 | ||||||
| 	"3: //dot_kernel_F1:				\n" | 	"3: //dot_kernel_F1:				\n" | ||||||
| 	"	ands	"J", "N", #"N_REM_MASK"		\n" | 	"	ands	%[J_], %[N_], #"N_REM_MASK"	\n" | ||||||
| 	"	ble	9f //dot_kernel_L999		\n" | 	"	ble	9f //dot_kernel_L999		\n" | ||||||
| 
 | 
 | ||||||
| 	"4: //dot_kernel_F10:				\n" | 	"4: //dot_kernel_F10:				\n" | ||||||
| 	"	"KERNEL_F1"				\n" | 	"	"KERNEL_F1"				\n" | ||||||
| 	"	subs	"J", "J", #1			\n" | 	"	subs	%[J_], %[J_], #1		\n" | ||||||
| 	"	bne	4b //dot_kernel_F10		\n" | 	"	bne	4b //dot_kernel_F10		\n" | ||||||
| 	"	b	9f //dot_kernel_L999		\n" | 	"	b	9f //dot_kernel_L999		\n" | ||||||
| 
 | 
 | ||||||
| 	"5: //dot_kernel_S_BEGIN:			\n" | 	"5: //dot_kernel_S_BEGIN:			\n" | ||||||
| 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n" | 	"	lsl	%[INCX_], %[INCX_], "INC_SHIFT"	\n" | ||||||
| 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n" | 	"	lsl	%[INCY_], %[INCY_], "INC_SHIFT"	\n" | ||||||
| 	"	asr	"J", "N", #2			\n" | 	"	asr	%[J_], %[N_], #2		\n" | ||||||
| 	"	cmp	"J", xzr			\n" | 	"	cmp	%[J_], xzr			\n" | ||||||
| 	"	ble	7f //dot_kernel_S1		\n" | 	"	ble	7f //dot_kernel_S1		\n" | ||||||
| 
 | 
 | ||||||
| 	"6: //dot_kernel_S4:				\n" | 	"6: //dot_kernel_S4:				\n" | ||||||
|  | @ -333,32 +324,30 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | ||||||
| 	"	"KERNEL_F1"				\n" | 	"	"KERNEL_F1"				\n" | ||||||
| 	"	"KERNEL_F1"				\n" | 	"	"KERNEL_F1"				\n" | ||||||
| 	"	"KERNEL_F1"				\n" | 	"	"KERNEL_F1"				\n" | ||||||
| 	"	subs	"J", "J", #1			\n" | 	"	subs	%[J_], %[J_], #1		\n" | ||||||
| 	"	bne	6b //dot_kernel_S4		\n" | 	"	bne	6b //dot_kernel_S4		\n" | ||||||
| 
 | 
 | ||||||
| 	"7: //dot_kernel_S1:				\n" | 	"7: //dot_kernel_S1:				\n" | ||||||
| 	"	ands	"J", "N", #3			\n" | 	"	ands	%[J_], %[N_], #3		\n" | ||||||
| 	"	ble	9f //dot_kernel_L999		\n" | 	"	ble	9f //dot_kernel_L999		\n" | ||||||
| 
 | 
 | ||||||
| 	"8: //dot_kernel_S10:				\n" | 	"8: //dot_kernel_S10:				\n" | ||||||
| 	"	"KERNEL_F1"				\n" | 	"	"KERNEL_F1"				\n" | ||||||
| 	"	subs	"J", "J", #1			\n" | 	"	subs	%[J_], %[J_], #1		\n" | ||||||
| 	"	bne	8b //dot_kernel_S10		\n" | 	"	bne	8b //dot_kernel_S10		\n" | ||||||
| 
 | 
 | ||||||
| 	"9: //dot_kernel_L999:				\n" | 	"9: //dot_kernel_L999:				\n" | ||||||
| 	"	str	"DOTF", [%[DOT_]]		\n" |  | ||||||
| 
 | 
 | ||||||
| 	: | 	: [DOT_]  "=&w" (dot) | ||||||
| 	: [DOT_]  "r"  (&dot),		//%0
 | 	: [N_]    "r"   (n), | ||||||
| 	  [N_]    "r"  (n),		//%1
 | 	  [X_]    "r"   (x), | ||||||
| 	  [X_]    "r"  (x),		//%2
 | 	  [INCX_] "r"   (inc_x), | ||||||
| 	  [INCX_] "r"  (inc_x),		//%3
 | 	  [Y_]    "r"   (y), | ||||||
| 	  [Y_]    "r"  (y),		//%4
 | 	  [INCY_] "r"   (inc_y), | ||||||
| 	  [INCY_] "r"  (inc_y)		//%5
 |           [J_]    "r"   (j) | ||||||
| 	: "cc", | 	: "cc", | ||||||
| 	  "memory", | 	  "memory", | ||||||
| 	  "x0", "x1", "x2", "x3", "x4", "x5", | 	  "d1", "d2", "d3", "d4", "d5", "d6", "d7" | ||||||
| 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" |  | ||||||
| 	); | 	); | ||||||
| 
 | 
 | ||||||
| 	return dot; | 	return dot; | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue