Merge pull request #3425 from binebrank/arm_sve_dgemm

Add dgemm kernel for arm64 SVE
commit 454edd741c
Martin Kroeker, 2021-11-26 16:14:55 +01:00, committed by GitHub
20 changed files with 4742 additions and 33 deletions


@@ -197,3 +197,7 @@ In chronological order:
* River Dillon <oss@outerpassage.net>
* [2021-07-10] fix compilation with musl libc
* Bine Brank <https://github.com/binebrank>
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM


@@ -20,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
endif
endif
ifeq ($(CORE), ARMV8SVE)
CCOMMON_OPT += -march=armv8-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a+sve
endif
endif
ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)


@@ -144,6 +144,24 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
endif ()
endif ()
if (${CORE} STREQUAL A64FX)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (NOT DYNAMIC_ARCH)
if (HAVE_AVX2)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")


@@ -1198,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_ARMV8SVE
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8SVE"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV8SVE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "armv8sve"
#define CORENAME "ARMV8SVE"
#endif
#ifdef FORCE_ARMV8
#define FORCE
@@ -1436,7 +1450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "a64fx"
#define CORENAME "A64FX"
#else


@@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
# symm for s and d
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
endif()
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
else ()
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
endif ()
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})


@@ -1531,29 +1531,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
ifdef DTRMMUNCOPY_M
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLNCOPY_M
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
ifdef DTRMMUTCOPY_M
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
endif
ifdef DTRMMLTCOPY_M
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
else
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
endif
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
@@ -1789,11 +1821,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
ifdef DSYMMUCOPY_M
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
else
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
endif
ifdef DSYMMLCOPY_M
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
else
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
endif
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@


@@ -143,34 +143,28 @@ endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))


@@ -0,0 +1,191 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)


@@ -0,0 +1,874 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* x0 x1 x2 d0 x3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define lanes x15
#define pA x16
#define alpha x17
#define alpha0 d10
#define alphaZ z2.d
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 lanes
// 16 pA
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
//v19 must save C3
//v20 must save C4
//v21 must save C5
//v22 must save C6
//v23 must save C7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
.macro INITv1x8
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
dup z20.d, #0
dup z21.d, #0
dup z22.d, #0
dup z23.d, #0
.endm
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M1
ld1d z1.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z0.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z0.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_M2
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
fmla z16.d, p1/m, z1.d, z8.d
ld1rd z8.d, p0/z, [pB]
fmla z17.d, p1/m, z1.d, z9.d
ld1rd z9.d, p0/z, [pB, 8]
fmla z18.d, p1/m, z1.d, z10.d
ld1rd z10.d, p0/z, [pB, 16]
fmla z19.d, p1/m, z1.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z1.d, z12.d
ld1rd z12.d, p0/z, [pB, 32]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z1.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z1.d, z15.d
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
.endm
.macro KERNELv1x8_E
fmla z16.d, p1/m, z1.d, z8.d
fmla z17.d, p1/m, z1.d, z9.d
fmla z18.d, p1/m, z1.d, z10.d
fmla z19.d, p1/m, z1.d, z11.d
fmla z20.d, p1/m, z1.d, z12.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
fmla z22.d, p1/m, z1.d, z14.d
fmla z23.d, p1/m, z1.d, z15.d
.endm
.macro KERNELv1x8_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
ld1rd z12.d, p0/z, [pB, 32]
ld1rd z13.d, p0/z, [pB, 40]
ld1rd z14.d, p0/z, [pB, 48]
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
fmla z18.d, p1/m, z0.d, z10.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.d, p1/m, z0.d, z11.d
fmla z20.d, p1/m, z0.d, z12.d
fmla z21.d, p1/m, z0.d, z13.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.d, p1/m, z0.d, z14.d
fmla z23.d, p1/m, z0.d, z15.d
.endm
.macro SAVEv1x8
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z28.d, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaZ
st1d z28.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z29.d, p1/z, [pCRow1]
fmla z29.d, p1/m, z21.d, alphaZ
st1d z29.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z30.d, p1/z, [pCRow2]
fmla z30.d, p1/m, z22.d, alphaZ
st1d z30.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z31.d, p1/z, [pCRow1]
fmla z31.d, p1/m, z23.d, alphaZ
st1d z31.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x4
dup z16.d, #0
dup z17.d, #0
dup z18.d, #0
dup z19.d, #0
.endm
.macro KERNELv1x4_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
ld1rd z10.d, p0/z, [pB, 16]
ld1rd z11.d, p0/z, [pB, 24]
add pB, pB, 32
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.d, p1/m, z0.d, z10.d
fmla z19.d, p1/m, z0.d, z11.d
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x2
dup z16.d, #0
dup z17.d, #0
.endm
.macro KERNELv1x2_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
add pB, pB, 16
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.d, p1/m, z0.d, z9.d
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow1, pCRow0, LDC
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/******************************************************************************/
.macro INITv1x1
dup z16.d, #0
.endm
.macro KERNELv1x1_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
ld1rd z8.d, p0/z, [pB]
add pB, pB, 8
fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
dup alphaZ, alpha
lsl LDC, LDC, #3 // ldc = ldc * 8
ptrue p0.d // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC
add pC, pC, LDC, lsl #3 // add 8 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* The loop over M is vector-length agnostic: whilelt predication handles the last M % SVE_LEN iterations in a single sweep, with no scalar tail loop */
mov counterI, #0
whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
.align 5
.Ldgemm_kernel_L8_Mv1_20:
mov pB, origPB
INITv1x8 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // at least two blocks of 8 to do?
blt .Ldgemm_kernel_L8_Mv1_32
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldgemm_kernel_L8_Mv1_22a
.align 5
.Ldgemm_kernel_L8_Mv1_22:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L8_Mv1_22
.align 5
.Ldgemm_kernel_L8_Mv1_22a:
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.align 5
.Ldgemm_kernel_L8_Mv1_32:
tst counterL, #1
ble .Ldgemm_kernel_L8_Mv1_40
KERNELv1x8_I
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_M2
KERNELv1x8_M1
KERNELv1x8_E
b .Ldgemm_kernel_L8_Mv1_44
.Ldgemm_kernel_L8_Mv1_40:
INITv1x8
.Ldgemm_kernel_L8_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L8_Mv1_100
.align 5
.Ldgemm_kernel_L8_Mv1_46:
KERNELv1x8_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L8_Mv1_46
.Ldgemm_kernel_L8_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x8
.Ldgemm_kernel_L8_Mv1_END:
incd counterI // counterI += svcntd()
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
b.any .Ldgemm_kernel_L8_Mv1_20
.Ldgemm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #4
ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #2 // add 4 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L4_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L4_Mv1_20:
mov pB, origPB
INITv1x4 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // at least one block of 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44
.align 5
.Ldgemm_kernel_L4_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L4_Mv1_22
.Ldgemm_kernel_L4_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L4_Mv1_100
.align 5
.Ldgemm_kernel_L4_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L4_Mv1_46
.Ldgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Ldgemm_kernel_L4_Mv1_END:
incd counterI // counterI += svcntd()
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L4_Mv1_20
.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
mov counterJ , origN
tst counterJ , #2
ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pC, pC, LDC, lsl #1 // add 2 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L2_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L2_Mv1_20:
mov pB, origPB
INITv1x2 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // at least one block of 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44
.align 5
.Ldgemm_kernel_L2_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L2_Mv1_22
.Ldgemm_kernel_L2_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L2_Mv1_100
.align 5
.Ldgemm_kernel_L2_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
subs counterL, counterL, #1
bne .Ldgemm_kernel_L2_Mv1_46
.Ldgemm_kernel_L2_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x2
.Ldgemm_kernel_L2_Mv1_END:
incd counterI // counterI += svcntd()
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L2_Mv1_20
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC
add pC, pC, LDC // add 1 x LDC
mov pA, origPA // pA = start of A array
.Ldgemm_kernel_L1_Mv1_BEGIN:
mov counterI, #0
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
.align 5
.Ldgemm_kernel_L1_Mv1_20:
mov pB, origPB
INITv1x1 // fill with zeros
asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44
.align 5
.Ldgemm_kernel_L1_Mv1_22:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_22
.Ldgemm_kernel_L1_Mv1_44:
ands counterL , origK, #7
ble .Ldgemm_kernel_L1_Mv1_100
.align 5
.Ldgemm_kernel_L1_Mv1_46:
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
subs counterL, counterL, #1
bgt .Ldgemm_kernel_L1_Mv1_46
.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x1
.Ldgemm_kernel_L1_Mv1_END:
incd counterI // counterI += svcntd()
whilelt p1.d, counterI, origM // predicate over the remaining rows of M
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L1_Mv1_20
.Ldgemm_kernel_L1_END:
/******************************************************************************/
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
ret
EPILOGUE
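
For orientation, a minimal C sketch of what one kernel column computes, using the same whilelt/cntp predication as the M loop above, so the M % SVE_LEN tail needs no separate code path. The helper name and the packing layout are assumptions for this illustration; the real kernel keeps eight accumulators (z16-z23) live per sweep.

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative only: one column of C += alpha * A * B, with A packed in
   row-panels of `lanes` doubles per k step and B given as one column. */
static void dgemm_col_v1_sketch(int64_t m, int64_t k, double alpha,
                                const double *pa_panel, const double *pb_col,
                                double *c_col)
{
    for (int64_t i = 0; i < m; i += (int64_t)svcntd()) {
        svbool_t p = svwhilelt_b64(i, m);               /* predicate covers the M tail */
        uint64_t lanes = svcntp_b64(svptrue_b64(), p);
        const double *pa = pa_panel + i * k;            /* each panel holds lanes*k doubles */
        svfloat64_t acc = svdup_f64(0.0);
        for (int64_t kk = 0; kk < k; kk++) {
            svfloat64_t a = svld1_f64(p, pa);           /* `lanes` rows of A at step kk */
            pa += lanes;
            acc = svmla_n_f64_m(p, acc, a, pb_col[kk]); /* fmla with broadcast B element */
        }
        svfloat64_t cv = svld1_f64(p, c_col + i);       /* predicated C += alpha * acc */
        cv = svmla_n_f64_m(p, cv, acc, alpha);
        svst1_f64(p, c_col + i, cv);
    }
}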

File diff suppressed because it is too large


@@ -0,0 +1,79 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
svint64_t lda_vec = svindex_s64(0LL, lda); // gather offsets {0, lda, 2*lda, ...}: one column per lane
uint64_t sve_size = svcntd(); // doubles per SVE vector
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n); // predicate over the remaining columns
uint64_t active = svcntp_b64(svptrue_b64(), pg); // live lane count = width of this panel
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); // row i across the active columns
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1++;
boffset += active;
}
aoffset += sve_size * lda;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}
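
A scalar reference for the gather packing above (illustrative only; `vl` stands in for svcntd()): each sweep packs `active` columns of column-major A into a row-panel, one gather per row i. The transposed variant that follows does the same walk with contiguous svld1 loads in place of the gathers.

#include <stdint.h>

static void ncopy_sve_v1_reference(int64_t m, int64_t n, const double *a,
                                   int64_t lda, double *b)
{
    int64_t vl = 4;                                      /* stand-in for svcntd() */
    for (int64_t j0 = 0; j0 < n; j0 += vl) {
        int64_t active = (n - j0 < vl) ? (n - j0) : vl;  /* tail panel is narrower */
        for (int64_t i = 0; i < m; i++)                  /* one gather per row */
            for (int64_t jj = 0; jj < active; jj++)
                *b++ = a[i + (j0 + jj) * lda];           /* lane jj reads column j0 + jj */
    }
}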


@@ -0,0 +1,77 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG j;
IFLOAT *aoffset, *aoffset1, *boffset;
uint64_t sve_size = svcntd();
aoffset = a;
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
aoffset1 = aoffset;
uint64_t i_cnt = m;
while (i_cnt--) {
svfloat64_t a_vec = svld1(pg, (double *)aoffset1); // contiguous load of the active lanes
svst1_f64(pg, (double *) boffset, a_vec);
aoffset1 += lda;
boffset += active;
}
aoffset += sve_size;
j += svcntd();
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}

File diff suppressed because it is too large


@@ -0,0 +1,93 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); // {0, -1, -2, ...}: per-lane diagonal test
svint64_t index = svindex_s64(0LL, 1LL); // {0, 1, 2, ...}: per-lane offset within the panel
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); // lane jj: posX + jj > posY + i, element is in the stored (lower) triangle
svint64_t temp = svadd_z(pg, posX_vec, index); // per-lane index posX + jj
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); // index of a[(posX+jj) + posY*lda]
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); // index of a[posY + (posX+jj)*lda], mirrored across the diagonal
svint64_t gat_ind = svsel(cmp, temp1, temp2); // choose per lane by the diagonal test
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, lda_vec); // lanes in the triangle step along their row (+lda)
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); // lanes past the diagonal step down the mirrored column (+1)
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg); // re-test the diagonal for the next row
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}
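
The routine above and the one below implement the same symmetric packing and differ only in which index svsel keeps: this file reads the lower triangle, the next one the upper. A scalar sketch of the semantics (illustrative, with simplified loop order):

#include <stdint.h>

/* Lane jj of a panel is offset r = posX + jj; step i is c = posY + i. The
   value always comes from the stored (lower) triangle, mirrored across the
   diagonal; the ucopy variant swaps the two indexings. */
static void symm_lcopy_reference(int64_t m, int64_t n, const double *a, int64_t lda,
                                 int64_t posX, int64_t posY, double *b)
{
    int64_t vl = 4;                                      /* stand-in for svcntd() */
    for (int64_t j0 = 0; j0 < n; j0 += vl, posX += vl) {
        int64_t active = (n - j0 < vl) ? (n - j0) : vl;
        for (int64_t i = 0; i < m; i++)
            for (int64_t jj = 0; jj < active; jj++) {
                int64_t r = posX + jj, c = posY + i;
                *b++ = (r > c) ? a[r + c * lda] : a[c + r * lda];
            }
    }
}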


@@ -0,0 +1,93 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, offset;
uint64_t sve_size = svcntd();
svint64_t posY_vec = svdup_s64(posY);
svint64_t posX_vec = svdup_s64(posX);
svint64_t lda_vec = svdup_s64(lda);
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
do {
offset = posX - posY;
svint64_t vec_off = svdup_s64(offset);
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
svint64_t temp = svadd_z(pg, posX_vec, index);
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
svint64_t gat_ind = svsel(cmp, temp2, temp1);
i = m;
while (i>0) {
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
gat_ind = svadd_m(cmp, gat_ind, one_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
svst1(pg, b, data_vec);
b += active;
offset --;
vec_off = svsub_z(pg, vec_off, one_vec);
cmp = svcmpgt(pg, vec_off, index_neg);
i--;
}
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
return 0;
}


@@ -0,0 +1,121 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
svint64_t index = svindex_s64(0LL, lda);
FLOAT *ao;
js = 0;
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY) {
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+k*lda+j);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+k*lda+j);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
return 0;
}


@@ -0,0 +1,121 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
do
{
X = posX;
if (posX <= posY) {
ao = a + posY + posX * lda;
} else {
ao = a + posX + posY * lda;
}
i = 0;
do
{
if (X > posY) {
ao ++;
b += n_active;
X ++;
i ++;
} else
if (X < posY) {
svfloat64_t aj_vec = svld1(pn, ao);
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+j*lda+k);
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
return 0;
}


@@ -0,0 +1,121 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
  svint64_t index = svindex_s64(0LL, lda);    /* lane k reads offset k*lda: one element per column */
FLOAT *ao;
js = 0;
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
            if (X < posY) {
                /* Gather one element from each of the n_active current columns (a row segment), pack it contiguously, then step to the next row. */
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
b += n_active;
X ++;
i ++;
} else
            if (X > posY) {
                /* Block on this side of the diagonal is not packed: just advance the pointers. */
ao += lda;
b += n_active;
X ++;
i ++;
} else {
                /* Diagonal block, packed with scalar loops: no way was found to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = ZERO;
}
for (int k = j; k < n_active; k++) {
b[temp++] = *(ao+k*lda+j);
}
}
#endif
ao += n_active;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
return 0;
}
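
This routine walks the matrix row-wise, so each step needs one element from each of the n_active current columns; svindex_s64 builds the per-lane offsets {0, lda, 2*lda, ...} and a single gather fetches the strided row segment. A hypothetical standalone sketch of that access pattern (the function name load_strided_row is invented), assuming column-major storage:

/* Gather a[0], a[lda], a[2*lda], ... (one element from each of up to VL
   columns of a column-major matrix) and store them contiguously in out. */
#include <arm_sve.h>

void load_strided_row(const double *a, long lda, long n, double *out) {
    svbool_t pn = svwhilelt_b64(0L, n);                  /* first min(VL, n) lanes    */
    svint64_t index = svindex_s64(0, lda);               /* {0, lda, 2*lda, ...}      */
    svfloat64_t row = svld1_gather_index(pn, a, index);  /* indices scaled by 8 bytes */
    svst1(pn, out, row);                                 /* pack contiguously         */
}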

View File

@ -0,0 +1,119 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
BLASLONG i, js;
BLASLONG X;
FLOAT *ao;
js = 0;
svbool_t pn = svwhilelt_b64(js, n);
int n_active = svcntp_b64(svptrue_b64(), pn);
do
{
X = posX;
if (posX <= posY) {
ao = a + posX + posY * lda;
} else {
ao = a + posY + posX * lda;
}
i = 0;
do
{
            if (X < posY) {
                /* Block on this side of the diagonal is not packed: just advance the pointers. */
ao ++;
b += n_active;
X ++;
i ++;
} else
            if (X > posY) {
                /* Copy n_active consecutive elements of the current column, then step to the next column. */
svfloat64_t aj_vec = svld1(pn, ao);
svst1(pn, b, aj_vec);
ao += lda;
b += n_active;
X ++;
i ++;
} else {
                /* Diagonal block, packed with scalar loops: no way was found to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k < j; k++) {
b[temp++] = *(ao+j*lda+k);
}
b[temp++] = ONE;
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#else
int temp = 0;
for (int j = 0; j < n_active; j++) {
for (int k = 0 ; k <= j; k++) {
b[temp++] = *(ao+j*lda+k);
}
for (int k = j+1; k < n_active; k++) {
b[temp++] = ZERO;
}
}
#endif
ao += n_active * lda;
b += n_active*n_active;
X += n_active;
i += n_active;
}
} while (i < m);
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
return 0;
}
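
For the diagonal block, the scalar loops in this file keep the elements on and above the block diagonal and zero-fill the rest, so the packed square is upper-triangular. A hypothetical standalone demo of the non-UNIT packing with n_active = 3 and a column-major 3x3 block (not part of the commit):

/* Packs a column-major 3x3 block the way the non-UNIT diagonal case above
   does: copy rows 0..j of column j, zero rows j+1..N-1.
   Prints: 1 0 0 4 5 0 7 8 9 */
#include <stdio.h>

int main(void) {
    enum { N = 3, LDA = 3 };
    double a[LDA * N] = { 1, 2, 3,  4, 5, 6,  7, 8, 9 };  /* columns of the block */
    double b[N * N];
    int temp = 0;
    for (int j = 0; j < N; j++) {
        for (int k = 0; k <= j; k++)                      /* on/above the diagonal */
            b[temp++] = a[j * LDA + k];
        for (int k = j + 1; k < N; k++)                   /* below the diagonal    */
            b[temp++] = 0.0;
    }
    for (int t = 0; t < N * N; t++)
        printf("%g ", b[t]);
    printf("\n");
    return 0;
}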

36
param.h
View File

@ -3294,13 +3294,44 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(ARMV8SVE) || defined(A64FX)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep the copy routines for the two directions separate. */
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#else /* Other/undetected ARMv8 cores */
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
@ -3325,6 +3356,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#endif /* Cores */
#endif /* ARMv8 */
#if defined(ARMV5)
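
The "sve_vl" that the comment in the ARMV8SVE block above refers to is the SVE vector length counted in doubles, which the ACLE exposes at runtime. A hypothetical probe, not part of the commit:

/* Prints the number of doubles per SVE vector: 2 on a 128-bit
   implementation, 8 on A64FX's 512-bit SVE. */
#include <stdio.h>
#include <arm_sve.h>

int main(void) {
    printf("doubles per SVE vector: %lu\n", (unsigned long)svcntd());
    return 0;
}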