Merge pull request #3425 from binebrank/arm_sve_dgemm
Add dgemm kernel for arm64 SVE
This commit is contained in:
commit
454edd741c
|
@ -197,3 +197,7 @@ In chronological order:
|
|||
|
||||
* River Dillon <oss@outerpassage.net>
|
||||
* [2021-07-10] fix compilation with musl libc
|
||||
|
||||
* Bine Brank <https://github.com/binebrank>
|
||||
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
|
||||
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
|
||||
|
|
|
@ -20,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV8SVE)
|
||||
CCOMMON_OPT += -march=armv8-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a+sve
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
|
@ -144,6 +144,24 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL A64FX)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
|
||||
|
|
16
getarch.c
16
getarch.c
|
@ -1198,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ARMV8SVE
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "ARMV8SVE"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DARMV8SVE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
|
||||
#define LIBNAME "armv8sve"
|
||||
#define CORENAME "ARMV8SVE"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_ARMV8
|
||||
#define FORCE
|
||||
|
@ -1436,7 +1450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \
|
||||
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
|
||||
#define LIBNAME "a64fx"
|
||||
#define CORENAME "A64FX"
|
||||
#else
|
||||
|
|
|
@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
|
||||
|
||||
# symm for s and d
|
||||
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
|
||||
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
|
||||
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
|
||||
endif()
|
||||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
|
||||
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define.
|
||||
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT.
|
||||
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
|
||||
set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
|
||||
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
|
||||
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
|
||||
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
|
||||
endif ()
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
|
|
|
@ -1531,29 +1531,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N
|
|||
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef DTRMMUNCOPY_M
|
||||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMLNCOPY_M
|
||||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMUTCOPY_M
|
||||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DTRMMLTCOPY_M
|
||||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -1789,11 +1821,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).
|
|||
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
|
||||
|
||||
ifdef DSYMMUCOPY_M
|
||||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
else
|
||||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifdef DSYMMLCOPY_M
|
||||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
else
|
||||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
|
||||
|
|
|
@ -143,34 +143,28 @@ endif
|
|||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
|
|
|
@ -0,0 +1,191 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
@ -0,0 +1,874 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* X0 X1 X2 s0 X3 x4 x5 x6 */
|
||||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
|
||||
|
||||
#define origM x0
|
||||
#define origN x1
|
||||
#define origK x2
|
||||
#define origPA x3
|
||||
#define origPB x4
|
||||
#define pC x5
|
||||
#define LDC x6
|
||||
#define temp x7
|
||||
#define counterL x8
|
||||
#define counterI x9
|
||||
#define counterJ x10
|
||||
#define pB x11
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
|
||||
#define lanes x15
|
||||
#define pA x16
|
||||
#define alpha x17
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaZ z2.d
|
||||
|
||||
#define A_PRE_SIZE 1536
|
||||
#define B_PRE_SIZE 512
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
// 02 origK
|
||||
// 03 origPA
|
||||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
// 11 pB
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 lanes
|
||||
// 16 pA
|
||||
// 17
|
||||
// 18 must save
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
// 31 sp
|
||||
|
||||
//v00 ALPHA -> pA0_0
|
||||
//v01 pA0_1
|
||||
//v02 ALPHA0
|
||||
//v03
|
||||
//v04
|
||||
//v05
|
||||
//v06
|
||||
//v07
|
||||
//v08 must save pB0_0
|
||||
//v09 must save pB0_1
|
||||
//v10 must save pB0_2
|
||||
//v11 must save pB0_3
|
||||
//v12 must save pB0_4
|
||||
//v13 must save pB0_5
|
||||
//v14 must save pB0_6
|
||||
//v15 must save pB0_7
|
||||
//v16 must save C0
|
||||
//v17 must save C1
|
||||
//v18 must save C2
|
||||
//v19 must save C3
|
||||
//v20 must save C4
|
||||
//v21 must save C5
|
||||
//v22 must save C6
|
||||
//v23 must save C7
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
.macro INITv1x8
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
dup z20.d, #0
|
||||
dup z21.d, #0
|
||||
dup z22.d, #0
|
||||
dup z23.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_I
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M1
|
||||
ld1d z1.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_M2
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
fmla z16.d, p1/m, z1.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
fmla z17.d, p1/m, z1.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
fmla z18.d, p1/m, z1.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
fmla z19.d, p1/m, z1.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
fmla z23.d, p1/m, z1.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_E
|
||||
fmla z16.d, p1/m, z1.d, z8.d
|
||||
fmla z17.d, p1/m, z1.d, z9.d
|
||||
fmla z18.d, p1/m, z1.d, z10.d
|
||||
fmla z19.d, p1/m, z1.d, z11.d
|
||||
fmla z20.d, p1/m, z1.d, z12.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z21.d, p1/m, z1.d, z13.d
|
||||
fmla z22.d, p1/m, z1.d, z14.d
|
||||
fmla z23.d, p1/m, z1.d, z15.d
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x8_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z28.d, p1/z, [pCRow2]
|
||||
fmla z28.d, p1/m, z20.d, alphaZ
|
||||
st1d z28.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z29.d, p1/z, [pCRow1]
|
||||
fmla z29.d, p1/m, z21.d, alphaZ
|
||||
st1d z29.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z30.d, p1/z, [pCRow2]
|
||||
fmla z30.d, p1/m, z22.d, alphaZ
|
||||
st1d z30.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z31.d, p1/z, [pCRow1]
|
||||
fmla z31.d, p1/m, z23.d, alphaZ
|
||||
st1d z31.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x4
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
ld1d z26.d, p1/z, [pCRow2]
|
||||
fmla z26.d, p1/m, z18.d, alphaZ
|
||||
st1d z26.d, p1, [pCRow2]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z27.d, p1/z, [pCRow1]
|
||||
fmla z27.d, p1/m, z19.d, alphaZ
|
||||
st1d z27.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x2
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
add pB, pB, 16
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld1d z25.d, p1/z, [pCRow1]
|
||||
fmla z25.d, p1/m, z17.d, alphaZ
|
||||
st1d z25.d, p1, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INITv1x1
|
||||
dup z16.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld1d z0.d, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
|
||||
add pB, pB, 8
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld1d z24.d, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaZ
|
||||
st1d z24.d, p1, [pCRow0]
|
||||
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alpha, d0
|
||||
dup alphaZ, alpha
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
ptrue p0.d // create true predicate
|
||||
|
||||
mov pB, origPB
|
||||
// Loop over N
|
||||
mov counterJ, origN
|
||||
asr counterJ, counterJ, #3 // J = J / 8
|
||||
cmp counterJ, #0
|
||||
ble .Ldgemm_kernel_L4_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat this as long as there are 8 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_BEGIN:
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #3 // add 8 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_BEGIN:
|
||||
|
||||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x8 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
blt .Ldgemm_kernel_L8_Mv1_32
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble .Ldgemm_kernel_L8_Mv1_22a
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L8_Mv1_22
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_22a:
|
||||
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble .Ldgemm_kernel_L8_Mv1_40
|
||||
|
||||
KERNELv1x8_I
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_M2
|
||||
KERNELv1x8_M1
|
||||
KERNELv1x8_E
|
||||
|
||||
|
||||
b .Ldgemm_kernel_L8_Mv1_44
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_40:
|
||||
|
||||
INITv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L8_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L8_Mv1_46:
|
||||
|
||||
KERNELv1x8_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L8_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x8
|
||||
|
||||
.Ldgemm_kernel_L8_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
b.any .Ldgemm_kernel_L8_Mv1_20
|
||||
|
||||
.Ldgemm_kernel_L8_END:
|
||||
|
||||
lsl temp, origK, #6
|
||||
add origPB, origPB, temp // B = B + K * 8 * 8
|
||||
|
||||
subs counterJ, counterJ , #1 // j--
|
||||
bgt .Ldgemm_kernel_L8_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 4 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #4
|
||||
ble .Ldgemm_kernel_L2_BEGIN
|
||||
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #2 // add 4 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x4 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 4 to do?
|
||||
ble .Ldgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L4_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L4_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L4_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Ldgemm_kernel_L4_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L4_END:
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 4 * 8
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 2 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #2
|
||||
ble .Ldgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC, lsl #1 // add 2 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x2 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 4 to do?
|
||||
ble .Ldgemm_kernel_L2_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L2_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L2_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L2_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Ldgemm_kernel_L2_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x2
|
||||
|
||||
.Ldgemm_kernel_L2_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L2_END:
|
||||
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
|
||||
|
||||
/******************************************************************************/
|
||||
/* Repeat the same thing if 1 left in N */
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #1
|
||||
ble .Ldgemm_kernel_L999 // done
|
||||
|
||||
mov pCRow0, pC
|
||||
|
||||
add pC, pC, LDC // add 1 x LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x1 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3 // L = K / 8
|
||||
cmp counterL , #0 // is there at least 8 to do?
|
||||
ble .Ldgemm_kernel_L1_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_22:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_22
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Ldgemm_kernel_L1_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Ldgemm_kernel_L1_Mv1_46:
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Ldgemm_kernel_L1_Mv1_46
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x1
|
||||
|
||||
.Ldgemm_kernel_L1_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Ldgemm_kernel_L1_Mv1_20
|
||||
|
||||
|
||||
.Ldgemm_kernel_L1_END:
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Ldgemm_kernel_L999:
|
||||
mov x0, #0 // set return value
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,79 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
svint64_t lda_vec = svindex_s64(0LL, lda);
|
||||
uint64_t sve_size = svcntd();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint64_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
|
||||
svst1_f64(pg, (double *) boffset, a_vec);
|
||||
aoffset1++;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size * lda;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint64_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
|
||||
svst1_f64(pg, (double *) boffset, a_vec);
|
||||
aoffset1 += lda;
|
||||
boffset += active;
|
||||
}
|
||||
aoffset += sve_size;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,93 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp2, temp1);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, one_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
|
||||
svst1(pg, b, data_vec);
|
||||
|
||||
b += active;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,121 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
/* Vector-length-agnostic SVE packing kernel for a triangular matrix.
 * Walks A (column-major, leading dimension lda) one n_active-wide column
 * panel at a time and packs it contiguously into b.  The SVE predicate pn
 * masks the tail panel when fewer than a full vector of columns remains,
 * so the same code runs for any hardware vector length.
 * NOTE(review): which triangle (upper/lower, unit/non-unit via UNIT) this
 * instance handles is fixed by the build system naming CNAME — confirm
 * against the kernel makefile.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js = 0;                            /* columns consumed so far          */
  svint64_t gather = svindex_s64(0LL, lda);   /* lane offsets 0, lda, 2*lda, ...  */

  svbool_t pn = svwhilelt_b64(js, n);         /* active lanes of current panel    */
  int n_active = svcntp_b64(svptrue_b64(), pn);

  do {
    BLASLONG X = posX;
    /* Starting element of the current panel inside A. */
    FLOAT *src = (posX <= posY) ? a + posY + posX * lda
                                : a + posX + posY * lda;

    BLASLONG i = 0;
    do {
      if (X == posY) {
        /* Block straddling the diagonal — handled scalar-wise.
         * (Original author's note: no way found to unroll this while
         * staying vector-length agnostic.) */
        int out = 0;
#ifdef UNIT
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = *(src + kk * lda + jj);
          b[out++] = ONE;                      /* implicit unit diagonal */
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = ZERO;
        }
#else
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk <= jj; kk++)
            b[out++] = *(src + kk * lda + jj);
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = ZERO;
        }
#endif
        src += n_active;
        b   += n_active * n_active;
        X   += n_active;
        i   += n_active;
      } else if (X > posY) {
        /* Stored side of the diagonal: gather one row of the panel
         * (lanes stride lda apart) and store it contiguously into b. */
        svfloat64_t row = svld1_gather_index(pn, src, gather);
        svst1(pn, b, row);
        src += 1;
        b   += n_active;
        X   += 1;
        i   += 1;
      } else {
        /* Other side of the diagonal: nothing is written, but b still
         * advances so the packed layout stays in step. */
        src += lda;
        b   += n_active;
        X   += 1;
        i   += 1;
      }
    } while (i < m);

    /* Next column panel: rebuild predicate and active-lane count. */
    posY += n_active;
    js   += n_active;
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));

  return 0;
}
|
|
@ -0,0 +1,121 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
/* Vector-length-agnostic SVE packing kernel for a triangular matrix.
 * Packs n_active-wide column panels of A (column-major, leading dimension
 * lda) into b using contiguous predicated loads; the predicate pn masks
 * the tail panel, keeping the code valid for any SVE vector length.
 * NOTE(review): which triangle (upper/lower, unit/non-unit via UNIT) this
 * instance handles is fixed by the build system naming CNAME — confirm
 * against the kernel makefile.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js = 0;                      /* columns consumed so far       */

  svbool_t pn = svwhilelt_b64(js, n);   /* active lanes of current panel */
  int n_active = svcntp_b64(svptrue_b64(), pn);

  do {
    BLASLONG X = posX;
    /* Starting element of the current panel inside A. */
    FLOAT *src = (posX <= posY) ? a + posY + posX * lda
                                : a + posX + posY * lda;

    BLASLONG i = 0;
    do {
      if (X == posY) {
        /* Block straddling the diagonal — handled scalar-wise.
         * (Original author's note: no way found to unroll this while
         * staying vector-length agnostic.) */
        int out = 0;
#ifdef UNIT
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = ZERO;
          b[out++] = ONE;                      /* implicit unit diagonal */
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = *(src + jj * lda + kk);
        }
#else
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = ZERO;
          for (int kk = jj; kk < n_active; kk++)
            b[out++] = *(src + jj * lda + kk);
        }
#endif
        src += n_active * lda;
        b   += n_active * n_active;
        X   += n_active;
        i   += n_active;
      } else if (X < posY) {
        /* Stored side of the diagonal: one contiguous predicated load
         * of a panel column, stored straight into b. */
        svfloat64_t col = svld1(pn, src);
        svst1(pn, b, col);
        src += lda;
        b   += n_active;
        X   += 1;
        i   += 1;
      } else {
        /* Other side of the diagonal: nothing is written, but b still
         * advances so the packed layout stays in step. */
        src += 1;
        b   += n_active;
        X   += 1;
        i   += 1;
      }
    } while (i < m);

    /* Next column panel: rebuild predicate and active-lane count. */
    posY += n_active;
    js   += n_active;
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));

  return 0;
}
|
|
@ -0,0 +1,121 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
/* Vector-length-agnostic SVE packing kernel for a triangular matrix.
 * Walks A (column-major, leading dimension lda) one n_active-wide column
 * panel at a time, gathering rows with stride lda into the contiguous
 * buffer b; the predicate pn masks the tail panel so the code works for
 * any SVE vector length.
 * NOTE(review): which triangle (upper/lower, unit/non-unit via UNIT) this
 * instance handles is fixed by the build system naming CNAME — confirm
 * against the kernel makefile.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js = 0;                            /* columns consumed so far          */
  svint64_t gather = svindex_s64(0LL, lda);   /* lane offsets 0, lda, 2*lda, ...  */

  svbool_t pn = svwhilelt_b64(js, n);         /* active lanes of current panel    */
  int n_active = svcntp_b64(svptrue_b64(), pn);

  do {
    BLASLONG X = posX;
    /* Starting element of the current panel inside A.
     * (Note the opposite offset choice compared to the sibling kernels.) */
    FLOAT *src = (posX <= posY) ? a + posX + posY * lda
                                : a + posY + posX * lda;

    BLASLONG i = 0;
    do {
      if (X == posY) {
        /* Block straddling the diagonal — handled scalar-wise.
         * (Original author's note: no way found to unroll this while
         * staying vector-length agnostic.) */
        int out = 0;
#ifdef UNIT
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = ZERO;
          b[out++] = ONE;                      /* implicit unit diagonal */
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = *(src + kk * lda + jj);
        }
#else
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = ZERO;
          for (int kk = jj; kk < n_active; kk++)
            b[out++] = *(src + kk * lda + jj);
        }
#endif
        src += n_active;
        b   += n_active * n_active;
        X   += n_active;
        i   += n_active;
      } else if (X < posY) {
        /* Stored side of the diagonal: gather one row of the panel
         * (lanes stride lda apart) and store it contiguously into b. */
        svfloat64_t row = svld1_gather_index(pn, src, gather);
        svst1(pn, b, row);
        src += 1;
        b   += n_active;
        X   += 1;
        i   += 1;
      } else {
        /* Other side of the diagonal: nothing is written, but b still
         * advances so the packed layout stays in step. */
        src += lda;
        b   += n_active;
        X   += 1;
        i   += 1;
      }
    } while (i < m);

    /* Next column panel: rebuild predicate and active-lane count. */
    posY += n_active;
    js   += n_active;
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));

  return 0;
}
|
|
@ -0,0 +1,119 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
/* Vector-length-agnostic SVE packing kernel for a triangular matrix.
 * Packs n_active-wide column panels of A (column-major, leading dimension
 * lda) into b using contiguous predicated loads; the predicate pn masks
 * the tail panel, keeping the code valid for any SVE vector length.
 * NOTE(review): which triangle (upper/lower, unit/non-unit via UNIT) this
 * instance handles is fixed by the build system naming CNAME — confirm
 * against the kernel makefile.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js = 0;                      /* columns consumed so far       */

  svbool_t pn = svwhilelt_b64(js, n);   /* active lanes of current panel */
  int n_active = svcntp_b64(svptrue_b64(), pn);

  do {
    BLASLONG X = posX;
    /* Starting element of the current panel inside A.
     * (Note the opposite offset choice compared to the sibling kernels.) */
    FLOAT *src = (posX <= posY) ? a + posX + posY * lda
                                : a + posY + posX * lda;

    BLASLONG i = 0;
    do {
      if (X == posY) {
        /* Block straddling the diagonal — handled scalar-wise.
         * (Original author's note: no way found to unroll this while
         * staying vector-length agnostic.) */
        int out = 0;
#ifdef UNIT
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk < jj; kk++)
            b[out++] = *(src + jj * lda + kk);
          b[out++] = ONE;                      /* implicit unit diagonal */
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = ZERO;
        }
#else
        for (int jj = 0; jj < n_active; jj++) {
          for (int kk = 0; kk <= jj; kk++)
            b[out++] = *(src + jj * lda + kk);
          for (int kk = jj + 1; kk < n_active; kk++)
            b[out++] = ZERO;
        }
#endif
        src += n_active * lda;
        b   += n_active * n_active;
        X   += n_active;
        i   += n_active;
      } else if (X > posY) {
        /* Stored side of the diagonal: one contiguous predicated load
         * of a panel column, stored straight into b. */
        svfloat64_t col = svld1(pn, src);
        svst1(pn, b, col);
        src += lda;
        b   += n_active;
        X   += 1;
        i   += 1;
      } else {
        /* Other side of the diagonal: nothing is written, but b still
         * advances so the packed layout stays in step. */
        src += 1;
        b   += n_active;
        X   += 1;
        i   += 1;
      }
    } while (i < m);

    /* Next column panel: rebuild predicate and active-lane count. */
    posY += n_active;
    js   += n_active;
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));

  return 0;
}
|
36
param.h
36
param.h
|
@ -3294,13 +3294,44 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#elif defined(ARMV8SVE) || defined(A64FX)
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
   Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#else /* Other/undetected ARMv8 cores */
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -3325,6 +3356,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#endif /* Cores */
|
||||
|
||||
|
||||
#endif /* ARMv8 */
|
||||
|
||||
#if defined(ARMV5)
|
||||
|
|
Loading…
Reference in New Issue