Merge pull request #3464 from binebrank/arm_sve_sgemm

Add sgemm part for Arm SVE

commit 697e2752d7
@ -1483,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT
	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@

ifdef STRMMUNCOPY_M
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@

$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@

$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif

ifdef STRMMLNCOPY_M
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@

$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@

$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif

ifdef STRMMUTCOPY_M
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@

$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@

$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif

ifdef STRMMLTCOPY_M
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@

$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@

$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif

$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@ -1809,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@

ifdef SSYMMUCOPY_M
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif

ifdef SSYMMLCOPY_M
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif

$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
@ -114,35 +114,26 @@ DSDOTKERNEL    = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S

SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c

DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
@ -114,35 +114,27 @@ DSDOTKERNEL    = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S

SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S

SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c

SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c

DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/*                   X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/

#define origM    x0
#define origN    x1
#define origK    x2
#define origPA   x3
#define origPB   x4
#define pC       x5
#define LDC      x6
#define temp     x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB       x11
#define pCRow0   x12
#define pCRow1   x13
#define pCRow2   x14

#define lanes    x15
#define pA       x16
#define alpha    w17

#define alpha0   s10
#define alphaZ   z2.s

#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128

// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp

//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7

/*******************************************************************************
* Macro definitions
*******************************************************************************/

.macro INITv1x8
    dup z16.s, #0
    dup z17.s, #0
    dup z18.s, #0
    dup z19.s, #0
    dup z20.s, #0
    dup z21.s, #0
    dup z22.s, #0
    dup z23.s, #0
.endm

.macro KERNELv1x8_I
    ld1w z0.s, p1/z, [pA]
    ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one
    add pA, pA, lanes, lsl #3            // pA = pA + lanes * 2 * 4

    ld1rw z8.s, p0/z, [pB]
    ld1rw z9.s, p0/z, [pB, 4]
    ld1rw z10.s, p0/z, [pB, 8]
    ld1rw z11.s, p0/z, [pB, 12]
    ld1rw z12.s, p0/z, [pB, 16]
    ld1rw z13.s, p0/z, [pB, 20]
    ld1rw z14.s, p0/z, [pB, 24]
    ld1rw z15.s, p0/z, [pB, 28]

    add pB, pB, 32

    fmla z16.s, p1/m, z0.s, z8.s
    ld1rw z8.s, p0/z, [pB]
    fmla z17.s, p1/m, z0.s, z9.s
    ld1rw z9.s, p0/z, [pB, 4]
    fmla z18.s, p1/m, z0.s, z10.s
    ld1rw z10.s, p0/z, [pB, 8]
    fmla z19.s, p1/m, z0.s, z11.s
    ld1rw z11.s, p0/z, [pB, 12]
    fmla z20.s, p1/m, z0.s, z12.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    ld1rw z12.s, p0/z, [pB, 16]
    fmla z21.s, p1/m, z0.s, z13.s
    ld1rw z13.s, p0/z, [pB, 20]
    fmla z22.s, p1/m, z0.s, z14.s
    ld1rw z14.s, p0/z, [pB, 24]
    fmla z23.s, p1/m, z0.s, z15.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
    ld1rw z15.s, p0/z, [pB, 28]

    add pB, pB, 32
.endm

.macro KERNELv1x8_M1
    ld1w z1.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    fmla z16.s, p1/m, z0.s, z8.s
    ld1rw z8.s, p0/z, [pB]
    fmla z17.s, p1/m, z0.s, z9.s
    ld1rw z9.s, p0/z, [pB, 4]
    fmla z18.s, p1/m, z0.s, z10.s
    ld1rw z10.s, p0/z, [pB, 8]
    fmla z19.s, p1/m, z0.s, z11.s
    ld1rw z11.s, p0/z, [pB, 12]
    fmla z20.s, p1/m, z0.s, z12.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    ld1rw z12.s, p0/z, [pB, 16]
    fmla z21.s, p1/m, z0.s, z13.s
    ld1rw z13.s, p0/z, [pB, 20]
    fmla z22.s, p1/m, z0.s, z14.s
    ld1rw z14.s, p0/z, [pB, 24]
    fmla z23.s, p1/m, z0.s, z15.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
    ld1rw z15.s, p0/z, [pB, 28]

    add pB, pB, 32
.endm

.macro KERNELv1x8_M2
    ld1w z0.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    fmla z16.s, p1/m, z1.s, z8.s
    ld1rw z8.s, p0/z, [pB]
    fmla z17.s, p1/m, z1.s, z9.s
    ld1rw z9.s, p0/z, [pB, 4]
    fmla z18.s, p1/m, z1.s, z10.s
    ld1rw z10.s, p0/z, [pB, 8]
    fmla z19.s, p1/m, z1.s, z11.s
    ld1rw z11.s, p0/z, [pB, 12]
    fmla z20.s, p1/m, z1.s, z12.s
    ld1rw z12.s, p0/z, [pB, 16]
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla z21.s, p1/m, z1.s, z13.s
    ld1rw z13.s, p0/z, [pB, 20]
    fmla z22.s, p1/m, z1.s, z14.s
    ld1rw z14.s, p0/z, [pB, 24]
    fmla z23.s, p1/m, z1.s, z15.s
    ld1rw z15.s, p0/z, [pB, 28]

    add pB, pB, 32
.endm

.macro KERNELv1x8_E
    fmla z16.s, p1/m, z1.s, z8.s
    fmla z17.s, p1/m, z1.s, z9.s
    fmla z18.s, p1/m, z1.s, z10.s
    fmla z19.s, p1/m, z1.s, z11.s
    fmla z20.s, p1/m, z1.s, z12.s
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla z21.s, p1/m, z1.s, z13.s
    fmla z22.s, p1/m, z1.s, z14.s
    fmla z23.s, p1/m, z1.s, z15.s
.endm

.macro KERNELv1x8_SUB
    ld1w z0.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    ld1rw z8.s, p0/z, [pB]
    ld1rw z9.s, p0/z, [pB, 4]
    ld1rw z10.s, p0/z, [pB, 8]
    ld1rw z11.s, p0/z, [pB, 12]
    ld1rw z12.s, p0/z, [pB, 16]
    ld1rw z13.s, p0/z, [pB, 20]
    ld1rw z14.s, p0/z, [pB, 24]
    ld1rw z15.s, p0/z, [pB, 28]

    add pB, pB, 32

    fmla z16.s, p1/m, z0.s, z8.s
    fmla z17.s, p1/m, z0.s, z9.s
    fmla z18.s, p1/m, z0.s, z10.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmla z19.s, p1/m, z0.s, z11.s
    fmla z20.s, p1/m, z0.s, z12.s
    fmla z21.s, p1/m, z0.s, z13.s
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    fmla z22.s, p1/m, z0.s, z14.s
    fmla z23.s, p1/m, z0.s, z15.s
.endm

.macro SAVEv1x8

    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

    add pCRow1, pCRow0, LDC
    ld1w z24.s, p1/z, [pCRow0]
    fmla z24.s, p1/m, z16.s, alphaZ
    st1w z24.s, p1, [pCRow0]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    add pCRow2, pCRow1, LDC
    ld1w z25.s, p1/z, [pCRow1]
    fmla z25.s, p1/m, z17.s, alphaZ
    st1w z25.s, p1, [pCRow1]
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

    add pCRow1, pCRow2, LDC
    ld1w z26.s, p1/z, [pCRow2]
    fmla z26.s, p1/m, z18.s, alphaZ
    st1w z26.s, p1, [pCRow2]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    add pCRow2, pCRow1, LDC
    ld1w z27.s, p1/z, [pCRow1]
    fmla z27.s, p1/m, z19.s, alphaZ
    st1w z27.s, p1, [pCRow1]
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

    add pCRow1, pCRow2, LDC
    ld1w z28.s, p1/z, [pCRow2]
    fmla z28.s, p1/m, z20.s, alphaZ
    st1w z28.s, p1, [pCRow2]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    add pCRow2, pCRow1, LDC
    ld1w z29.s, p1/z, [pCRow1]
    fmla z29.s, p1/m, z21.s, alphaZ
    st1w z29.s, p1, [pCRow1]
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

    add pCRow1, pCRow2, LDC
    ld1w z30.s, p1/z, [pCRow2]
    fmla z30.s, p1/m, z22.s, alphaZ
    st1w z30.s, p1, [pCRow2]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    ld1w z31.s, p1/z, [pCRow1]
    fmla z31.s, p1/m, z23.s, alphaZ
    st1w z31.s, p1, [pCRow1]

    add pCRow0, pCRow0, lanes, lsl #2    // pC = pC + lanes * 4

.endm

/******************************************************************************/

.macro INITv1x4
    dup z16.s, #0
    dup z17.s, #0
    dup z18.s, #0
    dup z19.s, #0
.endm

.macro KERNELv1x4_SUB
    ld1w z0.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    ld1rw z8.s, p0/z, [pB]
    ld1rw z9.s, p0/z, [pB, 4]
    ld1rw z10.s, p0/z, [pB, 8]
    ld1rw z11.s, p0/z, [pB, 12]

    add pB, pB, 16

    fmla z16.s, p1/m, z0.s, z8.s
    fmla z17.s, p1/m, z0.s, z9.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmla z18.s, p1/m, z0.s, z10.s
    fmla z19.s, p1/m, z0.s, z11.s
.endm

.macro SAVEv1x4

    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

    add pCRow1, pCRow0, LDC
    ld1w z24.s, p1/z, [pCRow0]
    fmla z24.s, p1/m, z16.s, alphaZ
    st1w z24.s, p1, [pCRow0]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    add pCRow2, pCRow1, LDC
    ld1w z25.s, p1/z, [pCRow1]
    fmla z25.s, p1/m, z17.s, alphaZ
    st1w z25.s, p1, [pCRow1]
    prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

    add pCRow1, pCRow2, LDC
    ld1w z26.s, p1/z, [pCRow2]
    fmla z26.s, p1/m, z18.s, alphaZ
    st1w z26.s, p1, [pCRow2]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    ld1w z27.s, p1/z, [pCRow1]
    fmla z27.s, p1/m, z19.s, alphaZ
    st1w z27.s, p1, [pCRow1]

    add pCRow0, pCRow0, lanes, lsl #2    // pC = pC + lanes * 4

.endm

/******************************************************************************/

.macro INITv1x2
    dup z16.s, #0
    dup z17.s, #0
.endm

.macro KERNELv1x2_SUB
    ld1w z0.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    ld1rw z8.s, p0/z, [pB]
    ld1rw z9.s, p0/z, [pB, 4]

    add pB, pB, 8

    fmla z16.s, p1/m, z0.s, z8.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
    fmla z17.s, p1/m, z0.s, z9.s
.endm

.macro SAVEv1x2

    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

    add pCRow1, pCRow0, LDC
    ld1w z24.s, p1/z, [pCRow0]
    fmla z24.s, p1/m, z16.s, alphaZ
    st1w z24.s, p1, [pCRow0]
    prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

    ld1w z25.s, p1/z, [pCRow1]
    fmla z25.s, p1/m, z17.s, alphaZ
    st1w z25.s, p1, [pCRow1]

    add pCRow0, pCRow0, lanes, lsl #2    // pC = pC + lanes * 4

.endm

/******************************************************************************/

.macro INITv1x1
    dup z16.s, #0
.endm

.macro KERNELv1x1_SUB
    ld1w z0.s, p1/z, [pA]
    add pA, pA, lanes, lsl #2            // pA = pA + lanes * 4

    ld1rw z8.s, p0/z, [pB]

    add pB, pB, 4

    fmla z16.s, p1/m, z0.s, z8.s
    prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm

.macro SAVEv1x1

    prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

    ld1w z24.s, p1/z, [pCRow0]
    fmla z24.s, p1/m, z16.s, alphaZ
    st1w z24.s, p1, [pCRow0]

    add pCRow0, pCRow0, lanes, lsl #2    // pC = pC + lanes * 4

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

    PROLOGUE

    .align 5
    add sp, sp, #-(11 * 16)
    stp d8, d9, [sp, #(0 * 16)]
    stp d10, d11, [sp, #(1 * 16)]
    stp d12, d13, [sp, #(2 * 16)]
    stp d14, d15, [sp, #(3 * 16)]
    stp d16, d17, [sp, #(4 * 16)]
    stp x18, x19, [sp, #(5 * 16)]
    stp x20, x21, [sp, #(6 * 16)]
    stp x22, x23, [sp, #(7 * 16)]
    stp x24, x25, [sp, #(8 * 16)]
    stp x26, x27, [sp, #(9 * 16)]
    str x28, [sp, #(10 * 16)]

    prfm PLDL1KEEP, [origPB]
    prfm PLDL1KEEP, [origPA]

    fmov alpha, s0
    dup alphaZ, alpha

    lsl LDC, LDC, #2                     // ldc = ldc * 4
    ptrue p0.s                           // create true predicate

    mov pB, origPB
// Loop over N
    mov counterJ, origN
    asr counterJ, counterJ, #3           // J = J / 8
    cmp counterJ, #0
    ble .Ldgemm_kernel_L4_BEGIN

/******************************************************************************/
/* Repeat this as long as there are 8 left in N */

    .align 5
.Ldgemm_kernel_L8_BEGIN:
    mov pCRow0, pC

    add pC, pC, LDC, lsl #3              // add 8 x LDC

    mov pA, origPA                       // pA = start of A array

.Ldgemm_kernel_L8_Mv1_BEGIN:

/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
    mov counterI, #0
    whilelt p1.s, counterI, origM
    cntp lanes, p0, p1.s                 // lanes contain number of active SVE lanes in M dimension

    .align 5
.Ldgemm_kernel_L8_Mv1_20:

    mov pB, origPB
    INITv1x8                             // fill with zeros

    asr counterL, origK, #3              // L = K / 8
    cmp counterL, #2                     // are there at least two blocks of 8 to do?
    blt .Ldgemm_kernel_L8_Mv1_32

    KERNELv1x8_I
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2

    subs counterL, counterL, #2          // subtract 2
    ble .Ldgemm_kernel_L8_Mv1_22a

    .align 5
.Ldgemm_kernel_L8_Mv1_22:

    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2

    subs counterL, counterL, #1
    bgt .Ldgemm_kernel_L8_Mv1_22

    .align 5
.Ldgemm_kernel_L8_Mv1_22a:

    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_E

    b .Ldgemm_kernel_L8_Mv1_44

    .align 5
.Ldgemm_kernel_L8_Mv1_32:

    tst counterL, #1
    ble .Ldgemm_kernel_L8_Mv1_40

    KERNELv1x8_I
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_M2
    KERNELv1x8_M1
    KERNELv1x8_E

    b .Ldgemm_kernel_L8_Mv1_44

.Ldgemm_kernel_L8_Mv1_40:

    INITv1x8

.Ldgemm_kernel_L8_Mv1_44:

    ands counterL, origK, #7
    ble .Ldgemm_kernel_L8_Mv1_100

    .align 5
.Ldgemm_kernel_L8_Mv1_46:

    KERNELv1x8_SUB

    subs counterL, counterL, #1
    bne .Ldgemm_kernel_L8_Mv1_46

.Ldgemm_kernel_L8_Mv1_100:
    prfm PLDL1KEEP, [pA]
    prfm PLDL1KEEP, [pA, #64]
    prfm PLDL1KEEP, [origPB]

    SAVEv1x8

.Ldgemm_kernel_L8_Mv1_END:

    incw counterI
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s                 // lanes contain number of active SVE lanes in M dimension
    b.any .Ldgemm_kernel_L8_Mv1_20

.Ldgemm_kernel_L8_END:

    lsl temp, origK, #5
    add origPB, origPB, temp             // B = B + K * 8 * 4

    subs counterJ, counterJ, #1          // j--
    bgt .Ldgemm_kernel_L8_BEGIN

/******************************************************************************/
/* Repeat the same thing if 4 left in N */

    .align 5
.Ldgemm_kernel_L4_BEGIN:

    mov counterJ, origN
    tst counterJ, #4
    ble .Ldgemm_kernel_L2_BEGIN

    mov pCRow0, pC

    add pC, pC, LDC, lsl #2              // add 4 x LDC

    mov pA, origPA                       // pA = start of A array

.Ldgemm_kernel_L4_Mv1_BEGIN:

    mov counterI, #0
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s

    .align 5
.Ldgemm_kernel_L4_Mv1_20:

    mov pB, origPB
    INITv1x4                             // fill with zeros

    asr counterL, origK, #3              // L = K / 8
    cmp counterL, #0                     // is there at least 8 to do?
    ble .Ldgemm_kernel_L4_Mv1_44

    .align 5
.Ldgemm_kernel_L4_Mv1_22:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x4_SUB
    KERNELv1x4_SUB
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x4_SUB
    KERNELv1x4_SUB
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x4_SUB
    KERNELv1x4_SUB
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x4_SUB
    KERNELv1x4_SUB

    subs counterL, counterL, #1
    bgt .Ldgemm_kernel_L4_Mv1_22

.Ldgemm_kernel_L4_Mv1_44:

    ands counterL, origK, #7
    ble .Ldgemm_kernel_L4_Mv1_100

    .align 5
.Ldgemm_kernel_L4_Mv1_46:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x4_SUB

    subs counterL, counterL, #1
    bne .Ldgemm_kernel_L4_Mv1_46

.Ldgemm_kernel_L4_Mv1_100:
    prfm PLDL1KEEP, [pA]
    prfm PLDL1KEEP, [pA, #64]
    prfm PLDL1KEEP, [origPB]

    SAVEv1x4

.Ldgemm_kernel_L4_Mv1_END:

    incw counterI
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s
    b.any .Ldgemm_kernel_L4_Mv1_20

.Ldgemm_kernel_L4_END:
    lsl temp, origK, #4
    add origPB, origPB, temp             // B = B + K * 4 * 4

/******************************************************************************/
/* Repeat the same thing if 2 left in N */

    .align 5
.Ldgemm_kernel_L2_BEGIN:

    mov counterJ, origN
    tst counterJ, #2
    ble .Ldgemm_kernel_L1_BEGIN

    mov pCRow0, pC

    add pC, pC, LDC, lsl #1              // add 2 x LDC

    mov pA, origPA                       // pA = start of A array

.Ldgemm_kernel_L2_Mv1_BEGIN:

    mov counterI, #0
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s

    .align 5
.Ldgemm_kernel_L2_Mv1_20:

    mov pB, origPB
    INITv1x2                             // fill with zeros

    asr counterL, origK, #3              // L = K / 8
    cmp counterL, #0                     // is there at least 8 to do?
    ble .Ldgemm_kernel_L2_Mv1_44

    .align 5
.Ldgemm_kernel_L2_Mv1_22:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x2_SUB
    KERNELv1x2_SUB
    KERNELv1x2_SUB
    KERNELv1x2_SUB
    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x2_SUB
    KERNELv1x2_SUB
    KERNELv1x2_SUB
    KERNELv1x2_SUB

    subs counterL, counterL, #1
    bgt .Ldgemm_kernel_L2_Mv1_22

.Ldgemm_kernel_L2_Mv1_44:

    ands counterL, origK, #7
    ble .Ldgemm_kernel_L2_Mv1_100

    .align 5
.Ldgemm_kernel_L2_Mv1_46:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x2_SUB

    subs counterL, counterL, #1
    bne .Ldgemm_kernel_L2_Mv1_46

.Ldgemm_kernel_L2_Mv1_100:
    prfm PLDL1KEEP, [pA]
    prfm PLDL1KEEP, [pA, #64]
    prfm PLDL1KEEP, [origPB]

    SAVEv1x2

.Ldgemm_kernel_L2_Mv1_END:

    incw counterI
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s
    b.any .Ldgemm_kernel_L2_Mv1_20

.Ldgemm_kernel_L2_END:
    add origPB, origPB, origK, lsl #3    // B = B + K * 2 * 4

/******************************************************************************/
/* Repeat the same thing if 1 left in N */

    .align 5
.Ldgemm_kernel_L1_BEGIN:

    mov counterJ, origN
    tst counterJ, #1
    ble .Ldgemm_kernel_L999              // done

    mov pCRow0, pC

    add pC, pC, LDC                      // add 1 x LDC

    mov pA, origPA                       // pA = start of A array

.Ldgemm_kernel_L1_Mv1_BEGIN:

    mov counterI, #0
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s

    .align 5
.Ldgemm_kernel_L1_Mv1_20:

    mov pB, origPB
    INITv1x1                             // fill with zeros

    asr counterL, origK, #3              // L = K / 8
    cmp counterL, #0                     // is there at least 8 to do?
    ble .Ldgemm_kernel_L1_Mv1_44

    .align 5
.Ldgemm_kernel_L1_Mv1_22:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB
    KERNELv1x1_SUB

    subs counterL, counterL, #1
    bgt .Ldgemm_kernel_L1_Mv1_22

.Ldgemm_kernel_L1_Mv1_44:

    ands counterL, origK, #7
    ble .Ldgemm_kernel_L1_Mv1_100

    .align 5
.Ldgemm_kernel_L1_Mv1_46:

    prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
    KERNELv1x1_SUB

    subs counterL, counterL, #1
    bgt .Ldgemm_kernel_L1_Mv1_46

.Ldgemm_kernel_L1_Mv1_100:
    prfm PLDL1KEEP, [pA]
    prfm PLDL1KEEP, [pA, #64]
    prfm PLDL1KEEP, [origPB]

    SAVEv1x1

.Ldgemm_kernel_L1_Mv1_END:

    incw counterI
    whilelt p1.s, counterI, origM        //SVE instruction
    cntp lanes, p0, p1.s
    b.any .Ldgemm_kernel_L1_Mv1_20

.Ldgemm_kernel_L1_END:

/******************************************************************************/

.Ldgemm_kernel_L999:
    mov x0, #0                           // set return value
    ldp d8, d9, [sp, #(0 * 16)]
    ldp d10, d11, [sp, #(1 * 16)]
    ldp d12, d13, [sp, #(2 * 16)]
    ldp d14, d15, [sp, #(3 * 16)]
    ldp d16, d17, [sp, #(4 * 16)]
    ldp x18, x19, [sp, #(5 * 16)]
    ldp x20, x21, [sp, #(6 * 16)]
    ldp x22, x23, [sp, #(7 * 16)]
    ldp x24, x25, [sp, #(8 * 16)]
    ldp x26, x27, [sp, #(9 * 16)]
    ldr x28, [sp, #(10 * 16)]
    add sp, sp, #(11*16)
    ret

    EPILOGUE
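The M-loop above never needs a scalar tail: `whilelt` builds a predicate that is all-true for full vectors and only partially true for the final M % SVE_LEN rows, and `b.any` exits once no lane remains active. Below is a minimal C intrinsics sketch of that control flow (my illustration, not part of the commit; the saxpy-style update merely stands in for the kernel's FMA blocks):

```c
#include <arm_sve.h>
#include <stdint.h>

/* Predicated M-loop in the style of the kernel above.
 * Assumes m >= 1, just as the kernel's do/while structure does. */
void m_loop_sketch(int64_t m, const float *a, float alpha, float *c) {
    int64_t i = 0;
    svbool_t p = svwhilelt_b32(i, m);              /* lanes for rows i..i+VL-1 */
    do {
        svfloat32_t va = svld1(p, a + i);          /* predicated load           */
        svfloat32_t vc = svld1(p, c + i);
        vc = svmla_m(p, vc, va, svdup_f32(alpha)); /* c += alpha * a            */
        svst1(p, c + i, vc);                       /* predicated store          */
        i += svcntw();                             /* like incw: one vector on  */
        p = svwhilelt_b32(i, m);                   /* like whilelt: next rows   */
    } while (svptest_any(svptrue_b32(), p));       /* like b.any: any lane left */
}
```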
File diff suppressed because it is too large
@ -0,0 +1,78 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  svint32_t lda_vec = svindex_s32(0LL, lda);
  uint32_t sve_size = svcntw();

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b32(j, n);
  uint32_t active = svcntp_b32(svptrue_b32(), pg);
  do {

    aoffset1 = aoffset;

    uint32_t i_cnt = m;
    while (i_cnt--) {
      // gather one element from each of the `active` columns of this block
      svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
      svst1_f32(pg, (float *) boffset, a_vec);
      aoffset1++;
      boffset += active;
    }
    aoffset += sve_size * lda;   // advance to the next block of columns

    j += svcntw();
    pg = svwhilelt_b32(j, n);
    active = svcntp_b32(svptrue_b32(), pg);

  } while (svptest_any(svptrue_b32(), pg));

  return 0;
}
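For reference, here is a scalar version of the packing order the gather-based copy above produces (my sketch, not in the commit; `vl` stands in for `svcntw()`). Each block of up to `vl` columns is emitted row by row, so the lanes of a single gather land contiguously in `b`:

```c
void sgemm_ncopy_ref(long m, long n, const float *a, long lda,
                     long vl, float *b) {
    for (long j = 0; j < n; j += vl) {              /* one block of columns  */
        long active = (n - j < vl) ? (n - j) : vl;  /* like svcntp/whilelt   */
        for (long i = 0; i < m; i++)                /* one gather per row    */
            for (long k = 0; k < active; k++)       /* one lane per column   */
                *b++ = a[(j + k) * lda + i];        /* element (i, j+k)      */
    }
}
```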
@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  uint32_t sve_size = svcntw();

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b32(j, n);
  uint32_t active = svcntp_b32(svptrue_b32(), pg);
  do {

    aoffset1 = aoffset;

    uint32_t i_cnt = m;
    while (i_cnt--) {
      // contiguous load of `active` elements; step lda between loads
      svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
      svst1_f32(pg, (float *) boffset, a_vec);
      aoffset1 += lda;
      boffset += active;
    }
    aoffset += sve_size;   // advance to the next block

    j += svcntw();
    pg = svwhilelt_b32(j, n);
    active = svcntp_b32(svptrue_b32(), pg);

  } while (svptest_any(svptrue_b32(), pg));

  return 0;
}
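The transposed copy walks memory contiguously, so a plain `svld1` replaces the gather of the ncopy variant. A scalar reference of the same order, for contrast (again my sketch, with `vl` standing in for `svcntw()`):

```c
void sgemm_tcopy_ref(long m, long n, const float *a, long lda,
                     long vl, float *b) {
    for (long j = 0; j < n; j += vl) {
        long active = (n - j < vl) ? (n - j) : vl;
        for (long i = 0; i < m; i++)            /* step lda between loads */
            for (long k = 0; k < active; k++)   /* contiguous in k        */
                *b++ = a[i * lda + j + k];
    }
}
```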
File diff suppressed because it is too large
@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  BLASLONG i, offset;

#if defined(DOUBLE)
  uint64_t sve_size = svcntd();
  svint64_t posY_vec = svdup_s64(posY);
  svint64_t posX_vec = svdup_s64(posX);

@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    active = svcntp_b64(svptrue_b64(), pg);
  } while (svptest_any(svptrue_b64(), pg));

#else
  uint32_t sve_size = svcntw();
  svint32_t posY_vec = svdup_s32(posY);
  svint32_t posX_vec = svdup_s32(posX);
  svint32_t lda_vec = svdup_s32(lda);
  svint32_t one_vec = svdup_s32(1);

  int32_t N = n;
  int32_t j = 0;
  svbool_t pg = svwhilelt_b32(j, N);
  int32_t active = svcntp_b32(svptrue_b32(), pg);
  svint32_t index_neg = svindex_s32(0, -1);
  svint32_t index = svindex_s32(0, 1);
  do {
    offset = posX - posY;
    svint32_t vec_off = svdup_s32(offset);
    svbool_t cmp = svcmpgt(pg, vec_off, index_neg);

    svint32_t temp = svadd_z(pg, posX_vec, index);
    svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
    svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
    svint32_t gat_ind = svsel(cmp, temp1, temp2);

    i = m;
    while (i>0) {
      svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);

      gat_ind = svadd_m(cmp, gat_ind, lda_vec);
      gat_ind = svadd_m(svnot_z(pg, cmp), gat_ind, one_vec);

      svst1(pg, b, data_vec);

      b += active;
      offset--;
      vec_off = svsub_z(pg, vec_off, one_vec);
      cmp = svcmpgt(pg, vec_off, index_neg);

      i--;
    }

    posX += sve_size;
    posX_vec = svdup_s32(posX);
    j += sve_size;
    pg = svwhilelt_b32(j, N);
    active = svcntp_b32(svptrue_b32(), pg);
  } while (svptest_any(svptrue_b32(), pg));

#endif

  return 0;
}
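The predicate dance above encodes where each lane sits relative to the diagonal: lanes still above it read the mirrored element and step along the stored triangle (index += lda), lanes at or below it read directly and step down (index += 1). A scalar equivalent of the single-precision branch (my illustration, not in the commit; this reads the lower-storage case, judging by the index choice):

```c
void symm_lcopy_ref(long m, long n, const float *a, long lda,
                    long posX, long posY, long vl, float *b) {
    for (long j = 0; j < n; j += vl) {                 /* one predicate-wide block */
        long active = (n - j < vl) ? (n - j) : vl;
        for (long k = 0; k < m; k++) {
            long Y = posY + k;                         /* current row              */
            for (long l = 0; l < active; l++) {
                long X = posX + j + l;                 /* column this lane fills   */
                /* only one triangle is stored; mirror across the diagonal */
                *b++ = (X > Y) ? a[X + Y * lda] : a[Y + X * lda];
            }
        }
    }
}
```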
@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  BLASLONG i, offset;

#if defined(DOUBLE)
  uint64_t sve_size = svcntd();
  svint64_t posY_vec = svdup_s64(posY);
  svint64_t posX_vec = svdup_s64(posX);

@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    active = svcntp_b64(svptrue_b64(), pg);
  } while (svptest_any(svptrue_b64(), pg));

#else
  uint32_t sve_size = svcntw();
  svint32_t posY_vec = svdup_s32(posY);
  svint32_t posX_vec = svdup_s32(posX);
  svint32_t lda_vec = svdup_s32(lda);
  svint32_t one_vec = svdup_s32(1);

  int32_t N = n;
  int32_t j = 0;
  svbool_t pg = svwhilelt_b32(j, N);
  int32_t active = svcntp_b32(svptrue_b32(), pg);
  svint32_t index_neg = svindex_s32(0, -1);
  svint32_t index = svindex_s32(0, 1);
  do {
    offset = posX - posY;
    svint32_t vec_off = svdup_s32(offset);
    svbool_t cmp = svcmpgt(pg, vec_off, index_neg);

    svint32_t temp = svadd_z(pg, posX_vec, index);
    svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
    svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
    svint32_t gat_ind = svsel(cmp, temp2, temp1);

    i = m;
    while (i>0) {
      svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);

      gat_ind = svadd_m(cmp, gat_ind, one_vec);
      gat_ind = svadd_m(svnot_z(pg, cmp), gat_ind, lda_vec);

      svst1(pg, b, data_vec);

      b += active;
      offset--;
      vec_off = svsub_z(pg, vec_off, one_vec);
      cmp = svcmpgt(pg, vec_off, index_neg);

      i--;
    }

    posX += sve_size;
    posX_vec = svdup_s32(posX);
    j += sve_size;
    pg = svwhilelt_b32(j, N);
    active = svcntp_b32(svptrue_b32(), pg);
  } while (svptest_any(svptrue_b32(), pg));

#endif

  return 0;
}
@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  BLASLONG i, js;
  BLASLONG X;

  svint64_t index = svindex_s64(0LL, lda);

  FLOAT *ao;
  js = 0;
  FLOAT *ao;
#ifdef DOUBLE
  svint64_t index = svindex_s64(0LL, lda);
  svbool_t pn = svwhilelt_b64(js, n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
#else
  svint32_t index = svindex_s32(0, lda);
  svbool_t pn = svwhilelt_b32(js, n);
  int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
  do
  {
    X = posX;

@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    do
    {
      if (X > posY) {
#ifdef DOUBLE
        svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
        svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
        svst1(pn, b, aj_vec);
        ao ++;
        b += n_active;

@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    posY += n_active;
    js += n_active;
#ifdef DOUBLE
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
#else
    pn = svwhilelt_b32(js, n);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
#endif

  return 0;
}
@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  FLOAT *ao;
  js = 0;
#ifdef DOUBLE
  svbool_t pn = svwhilelt_b64(js, n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
#else
  svbool_t pn = svwhilelt_b32(js, n);
  int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
  do
  {
    X = posX;

@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
      i ++;
    } else
    if (X < posY) {
#ifdef DOUBLE
      svfloat64_t aj_vec = svld1(pn, ao);
#else
      svfloat32_t aj_vec = svld1(pn, ao);
#endif
      svst1(pn, b, aj_vec);
      ao += lda;
      b += n_active;

@ -112,9 +121,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    posY += n_active;
    js += n_active;
#ifdef DOUBLE
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
#else
    pn = svwhilelt_b32(js, n);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
#endif

  return 0;
@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  BLASLONG i, js;
  BLASLONG X;

  svint64_t index = svindex_s64(0LL, lda);

  FLOAT *ao;
  js = 0;
  FLOAT *ao;
#ifdef DOUBLE
  svint64_t index = svindex_s64(0LL, lda);
  svbool_t pn = svwhilelt_b64(js, n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
#else
  svint32_t index = svindex_s32(0, lda);
  svbool_t pn = svwhilelt_b32(js, n);
  int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
  do
  {
    X = posX;

@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    do
    {
      if (X < posY) {
#ifdef DOUBLE
        svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
        svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
        svst1(pn, b, aj_vec);
        ao ++;
        b += n_active;

@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    posY += n_active;
    js += n_active;
#ifdef DOUBLE
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
#else
    pn = svwhilelt_b32(js, n);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
#endif

  return 0;
}
@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  FLOAT *ao;
  js = 0;
#ifdef DOUBLE
  svbool_t pn = svwhilelt_b64(js, n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
#else
  svbool_t pn = svwhilelt_b32(js, n);
  int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
  do
  {
    X = posX;

@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
      i ++;
    } else
    if (X > posY) {
#ifdef DOUBLE
      svfloat64_t aj_vec = svld1(pn, ao);
#else
      svfloat32_t aj_vec = svld1(pn, ao);
#endif
      svst1(pn, b, aj_vec);
      ao += lda;
      b += n_active;

@ -111,9 +120,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
    posY += n_active;
    js += n_active;
#ifdef DOUBLE
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
#else
    pn = svwhilelt_b32(js, n);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
#endif

  return 0;
}
param.h
@ -3309,14 +3309,22 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#elif defined(ARMV8SVE) || defined(A64FX)

#define SGEMM_DEFAULT_UNROLL_M  16
#define SGEMM_DEFAULT_UNROLL_N  4
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
#define SGEMM_DEFAULT_UNROLL_M  4
#define SGEMM_DEFAULT_UNROLL_N  8
/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N).
 * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
 * If the SVE size is ever more than 1024 bits, this should be increased as well. */
#define SGEMM_DEFAULT_UNROLL_MN 32

/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
#define DGEMM_DEFAULT_UNROLL_M  2
#define DGEMM_DEFAULT_UNROLL_N  8

#define DGEMM_DEFAULT_UNROLL_MN 32

#define CGEMM_DEFAULT_UNROLL_M  8
#define CGEMM_DEFAULT_UNROLL_N  4
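A worked check of the hand-set value (my arithmetic, not from the commit):

```c
/* The "v1" SVE kernels block M by the vector length in 32-bit lanes, so the
 * effective SGEMM unroll in M is sve_vl rather than the placeholder 4 above:
 *   1024-bit SVE  ->  1024 / 32 = 32 float lanes
 *   SGEMM_UNROLL_MN = max(UNROLL_M, UNROLL_N) = max(32, 8) = 32
 * A hypothetical 2048-bit implementation would give 64 lanes, which is why
 * the comment warns that the value must grow if SVE ever exceeds 1024 bits. */
```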