diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index 38d7ff8f1..ccbce27e1 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -160,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
 
 CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CGEMMINCOPY = cgemm_ncopy_sve_v1.c
-CGEMMITCOPY = cgemm_tcopy_sve_v1.c
+CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
@@ -184,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
 
 ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
+ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1
index a8dafe862..bc5999097 100644
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,66 +1 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRMMUNCOPY_M =
-CTRMMLNCOPY_M =
-CTRMMUTCOPY_M =
-CTRMMLTCOPY_M =
-CHEMMLTCOPY_M =
-CHEMMUTCOPY_M =
-CSYMMUCOPY_M =
-CSYMMLCOPY_M =
-
-CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
-endif
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMCOPYLN_M =
-ZTRSMCOPYLT_M =
-ZTRSMCOPYUN_M =
-ZTRSMCOPYUT_M =
-
-ZTRMMUNCOPY_M =
-ZTRMMLNCOPY_M =
-ZTRMMUTCOPY_M =
-ZTRMMLTCOPY_M =
-ZHEMMLTCOPY_M =
-ZHEMMUTCOPY_M =
-ZSYMMUCOPY_M =
-ZSYMMLCOPY_M =
-
-ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-ZGEMMINCOPYOBJ =
-ZGEMMITCOPYOBJ =
-endif
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S
index 38770f66b..2136ebbee 100644
--- a/kernel/arm64/cgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S
@@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add	pB, pB, 32
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1rw	z15.s, p0/z, [pB, 28]
 
 	add	pB, pB, 32
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.s, p1/m, z2.s, z15.s
 	ld1rw	z15.s, p0/z, [pB, 28]
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
 	add	pB, pB, 32
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z3.s, z15.s
 	OP_ri	z23.s, p1/m, z2.s, z15.s
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z1.s, z15.s
 	OP_ri	z23.s, p1/m, z0.s, z15.s
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w	{z26.s, z27.s}, p1, [pCRow1]
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
 	ld2w	{z28.s, z29.s}, p1/z, [pCRow2]
 	fmla	z28.s, p1/m, z20.s, alphaz_R
 	fmls	z28.s, p1/m, z21.s, alphaz_I
@@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.s, p1/m, z23.s, alphaz_R
 	st2w	{z30.s, z31.s}, p1, [pCRow3]
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #3	// pC = pC + lanes * 2 *4
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w	{z26.s, z27.s}, p1, [pCRow1]
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add	pCRow0, pCRow0, lanes, lsl #3	// pC = pC + lanes * 2 *4
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, s0
 	dup	alphaz_R, alphaR
 	fmov	alphaI, s1
@@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lcgemm_kernel_L4_Mv1_46
 
 .Lcgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lcgemm_kernel_L4_Mv1_END:
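Note on this assembly diff (the zgemm diff below follows the same pattern): every software-prefetch hint is dropped from the SVE complex GEMM kernel, PLDL1KEEP on the packed A/B panels in the FMA macros and PLDL2KEEP on the C row pointers in the SAVE macros. One removed hint was plainly stale (SAVEv1x1 prefetched pCRow3 although that macro only writes through pCRow0); the rest presumably cost issue slots for data the hardware prefetcher already tracks on these cores, though the diff itself does not state the rationale. For orientation, a rough C rendering of what the removed hints did, assuming GCC/Clang on AArch64 where __builtin_prefetch lowers to prfm (a sketch, not part of the patch):

    /* prfm PLDL1KEEP, [p]: read prefetch, keep in L1 (high temporal locality). */
    static inline void prefetch_l1_keep(const void *p) { __builtin_prefetch(p, 0, 3); }

    /* prfm PLDL2KEEP, [p]: read prefetch, keep at the L2 level. */
    static inline void prefetch_l2_keep(const void *p) { __builtin_prefetch(p, 0, 2); }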
diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..90f867b44
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)         \
+    a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec);     \
+    a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
+    svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag));           \
+    a_offset_inner += 2;                                              \
+    b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
+    SV_TYPE a_vec_real;
+    SV_TYPE a_vec_imag;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * lda * 2;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
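How the new ncopy kernel packs: lda_vec holds the element offsets {0, 2*lda, 4*lda, ...}, so each pair of gathers collects the real and the imaginary parts of one row position across sve_size consecutive columns, and svst2 stores them interleaved into the packed panel. A scalar sketch of the same traversal (illustrative only, single precision shown; VL stands for the runtime svcntw()/svcntd() value):

    /* Scalar reference for gemm_ncopy_complex_sve_v1x4.c (sketch, not part of the patch). */
    static void ncopy_complex_ref(long m, long n, const float *a, long lda,
                                  float *b, long VL) {
        for (long j0 = 0; j0 < n; j0 += VL) {          /* panel of up to VL columns */
            long w = (n - j0 < VL) ? n - j0 : VL;      /* active lanes */
            for (long i = 0; i < m; i++)               /* rows; unrolled 4x above */
                for (long k = 0; k < w; k++) {         /* one lane per column */
                    *b++ = a[(j0 + k) * lda * 2 + i * 2];     /* real */
                    *b++ = a[(j0 + k) * lda * 2 + i * 2 + 1]; /* imaginary */
                }
        }
    }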
diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..975166a2e
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64x2_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32x2_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
+    a_vec = svld2(pg, a_offset_inner);                        \
+    svst2(pg, b_offset, a_vec);                               \
+    a_offset_inner += lda * 2;                                \
+    b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * 2;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
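The tcopy variant is the transposed walk: a_offset_inner advances by a whole row (lda * 2 floats) per INNER_COPY while the vector spans sve_size consecutive complex elements, so the svld2/svst2 pair (de-interleave, then re-interleave) reduces to a predicated contiguous copy of `active` complex values. A scalar sketch (illustrative only, single precision shown):

    #include <string.h>

    /* Scalar reference for gemm_tcopy_complex_sve_v1x4.c (sketch, not part of the patch). */
    static void tcopy_complex_ref(long m, long n, const float *a, long lda,
                                  float *b, long VL) {
        for (long j0 = 0; j0 < n; j0 += VL) {
            long w = (n - j0 < VL) ? n - j0 : VL;   /* active complex lanes */
            for (long i = 0; i < m; i++) {          /* one row per INNER_COPY */
                memcpy(b, a + i * lda * 2 + j0 * 2, (size_t)w * 2 * sizeof(float));
                b += w * 2;
            }
        }
    }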
diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S
index d5b35775c..a043948d6 100644
--- a/kernel/arm64/zgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S
@@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1rd	z15.d, p0/z, [pB, 56]
 	add	pB, pB, 64
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1rd	z15.d, p0/z, [pB, 56]
 
 	add	pB, pB, 64
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.d, p1/m, z2.d, z15.d
 	ld1rd	z15.d, p0/z, [pB, 56]
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
 	add	pB, pB, 64
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z3.d, z14.d
 	OP_ii	z22.d, p1/m, z3.d, z15.d
 	OP_ri	z23.d, p1/m, z2.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z1.d, z14.d
 	OP_ii	z22.d, p1/m, z1.d, z15.d
 	OP_ri	z23.d, p1/m, z0.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d	{z26.d, z27.d}, p1, [pCRow1]
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
 
 	ld2d	{z28.d, z29.d}, p1/z, [pCRow2]
 	fmla	z28.d, p1/m, z20.d, alphaz_R
@@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.d, p1/m, z23.d, alphaz_R
 	st2d	{z30.d, z31.d}, p1, [pCRow3]
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #4	// pC = pC + lanes * 2 *8
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d	{z26.d, z27.d}, p1, [pCRow1]
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes * 2 *8
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, d0
 	dup	alphaz_R, alphaR
 	fmov	alphaI, d1
@@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lzgemm_kernel_L4_Mv1_46
 
 .Lzgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lzgemm_kernel_L4_Mv1_END:
diff --git a/param.h b/param.h
index 84e0c2ac7..8fb4bcc48 100644
--- a/param.h
+++ b/param.h
@@ -3385,11 +3385,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded device
 #define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate
 #define DGEMM_DEFAULT_UNROLL_N 8
 
-#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_MN 16
 
-#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_M 2
 #define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
 
 #define SGEMM_DEFAULT_P 128
 #define DGEMM_DEFAULT_P 160
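On the param.h hunk: with the vector-length-agnostic v1 kernels the real M blocking is the hardware vector length, known only at run time, so the compile-time CGEMM/ZGEMM_DEFAULT_UNROLL_M values look like conservative placeholders, while the new *_DEFAULT_UNROLL_MN entries bound the M*N product used for buffer sizing. That reading is an inference from the diff, not something it states. A small probe of the effective M blocking on SVE hardware (sketch, not part of the patch):

    #include <arm_sve.h>
    #include <stdio.h>

    int main(void) {
        /* Lanes per SVE vector = complex elements one v1 kernel row covers. */
        printf("CGEMM effective UNROLL_M: %lu\n", (unsigned long)svcntw());
        printf("ZGEMM effective UNROLL_M: %lu\n", (unsigned long)svcntd());
        return 0;
    }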