diff --git a/Makefile.system b/Makefile.system index 76d755ec2..023546009 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1154,6 +1154,7 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index b8d0af5c2..baf39150f 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, #128 .endm +/* + * No need to do software prefetches if the vector fits + * into L1 cache + */ +.macro KERNEL_F16_L1CACHE + ldp q4, q5, [X] + ldp q16, q17, [Y] + + ldp q6, q7, [X, #32] + ldp q18, q19, [Y, #32] + + fmla v16.2d, v4.2d, v0.d[0] + fmla v17.2d, v5.2d, v0.d[0] + + stp q16, q17, [Y] + + ldp q20, q21, [X, #64] + ldp q24, q25, [Y, #64] + + fmla v18.2d, v6.2d, v0.d[0] + fmla v19.2d, v7.2d, v0.d[0] + + stp q18, q19, [Y, #32] + + ldp q22, q23, [X, #96] + ldp q26, q27, [Y, #96] + + fmla v24.2d, v20.2d, v0.d[0] + fmla v25.2d, v21.2d, v0.d[0] + + stp q24, q25, [Y, #64] + + fmla v26.2d, v22.2d, v0.d[0] + fmla v27.2d, v23.2d, v0.d[0] + + stp q26, q27, [Y, #96] + + add Y, Y, #128 + add X, X, #128 +.endm + .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm + +.macro KERNEL_F32_L1CACHE + KERNEL_F16_L1CACHE + KERNEL_F16_L1CACHE +.endm + .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp I, xzr beq .Ldaxpy_kernel_F1 + cmp N, #2048 + ble .Ldaxpy_kernel_F32_L1CACHE + .align 5 .Ldaxpy_kernel_F32: @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subs I, I, #1 bne .Ldaxpy_kernel_F32 + b .Ldaxpy_kernel_F1 + + .align 5 +.Ldaxpy_kernel_F32_L1CACHE: + + KERNEL_F32_L1CACHE + + subs I, I, #1 + bne .Ldaxpy_kernel_F32_L1CACHE .Ldaxpy_kernel_F1: