diff --git a/.travis.yml b/.travis.yml index 4a25e7121..a0af0472e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -85,8 +85,8 @@ jobs: sudo: true language: minimal before_install: - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } install: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' diff --git a/Makefile.rule b/Makefile.rule index 8c651412e..6522b0777 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -152,6 +152,9 @@ NO_AFFINITY = 1 # FUNCTION_PROFILE = 1 # Support for IEEE quad precision(it's *real* REAL*16)( under testing) +# This option should not be used - it is a holdover from unfinished code present +# in the original GotoBLAS2 library that may be usable as a starting point but +# is not even expected to compile in its present form. # QUAD_PRECISION = 1 # Theads are still working for a while after finishing BLAS operation diff --git a/c_check b/c_check index 66acf1cad..9dc237beb 100644 --- a/c_check +++ b/c_check @@ -205,7 +205,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "int main(void){ __asm__ volatile($code); }\n"; + print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 1446a900d..38d59f956 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,6 +3,11 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables.
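For reference, the AVX-512 probe that c_check above (and cmake/system_check.cmake below) now emits is an ordinary C translation unit; the build compiles it with -march=skylake-avx512 and treats a non-zero compiler exit status as "no AVX-512 support", defining -DNO_AVX512. Including <immintrin.h> presumably makes the probe require working intrinsics headers as well as assembler support. A standalone sketch of the generated file:

  /* AVX-512 probe as emitted by the build scripts: if the toolchain cannot
     compile this file with -march=skylake-avx512, the build defines
     -DNO_AVX512 and falls back to non-AVX512 kernels. */
  #include <immintrin.h>

  int main(void){ __asm__ volatile("vbroadcastss -4 * 4(%rsi), %zmm2"); }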
+if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index ca88a6d5f..df4b2ab06 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,4 +1,5 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ @@ -6,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d339a755f..fe30c7600 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -67,7 +67,7 @@ else() endif() if (X86_64 OR X86) - file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") diff --git a/driver/others/memory.c b/driver/others/memory.c index 0019253c0..4a8e6c067 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2587,20 +2587,20 @@ void *blas_memory_alloc(int procpos){ position = 0; + LOCK_COMMAND(&alloc_lock); do { /* if (!memory[position].used) { */ - LOCK_COMMAND(&alloc_lock); /* blas_lock(&memory[position].lock);*/ if (!memory[position].used) goto allocation; - UNLOCK_COMMAND(&alloc_lock); /* blas_unlock(&memory[position].lock);*/ /* } */ position ++; } while (position < NUM_BUFFERS); + UNLOCK_COMMAND(&alloc_lock); goto error; diff --git a/exports/Makefile b/exports/Makefile index 29075a9c2..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -114,9 +114,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index ebeb103e7..0dde33ae3 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@
-97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - laswp[flag], nthreads); + (int(*)())laswp[flag], nthreads); } #endif diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index 31e08451d..b77a40985 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); } #endif diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index 5b08e5028..9a75885a2 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -58,11 +58,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -82,22 +82,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -107,7 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X @@ -118,11 +118,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -133,7 +133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -142,22 +142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X @@ -184,11 +184,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
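The driver/others/memory.c hunk above restructures blas_memory_alloc so that the scan for a free buffer runs under a single LOCK_COMMAND/UNLOCK_COMMAND pair instead of acquiring and releasing the lock once per table slot, and so that the lock is also released on the error path. A minimal sketch of the resulting pattern, assuming pthreads and simplified stand-in types rather than the real OpenBLAS definitions:

  /* Sketch of the post-change locking pattern: one lock acquisition guards
     the whole scan of the buffer table. Names and types are simplified
     stand-ins, not the real OpenBLAS internals. */
  #include <pthread.h>

  #define NUM_BUFFERS 64

  static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  static struct { int used; } memory[NUM_BUFFERS];

  static int find_free_buffer(void) {
    int position = 0;
    pthread_mutex_lock(&alloc_lock);        /* was: locked inside the loop */
    do {
      if (!memory[position].used) {
        memory[position].used = 1;          /* claim the slot under the lock */
        pthread_mutex_unlock(&alloc_lock);
        return position;
      }
      position++;
    } while (position < NUM_BUFFERS);
    pthread_mutex_unlock(&alloc_lock);      /* new: unlock on the error path */
    return -1;                              /* corresponds to "goto error" */
  }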
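The (int(*)())laswp[flag] casts added to interface/lapack/laswp.c and zlaswp.c above do not change behavior: blas_level1_thread declares its worker argument as a generic function pointer, while the laswp[] dispatch-table entries carry a specific signature, and newer compilers warn about the mismatch. A hedged illustration of the pattern, with invented names (worker, run_threaded) rather than the OpenBLAS ones; note the empty parameter list relies on pre-C23 "unspecified parameters" semantics:

  #include <stddef.h>

  /* Specific worker signature, as a dispatch-table entry. */
  static int worker(int n, double *x) {
    (void)n; (void)x;
    return 0;
  }

  /* Generic driver: accepts any worker via an unprototyped pointer. */
  static int run_threaded(int (*fn)(), void *arg) {
    (void)arg;
    return fn(0, (double *)NULL);  /* argument types match worker's */
  }

  int main(void) {
    int (*table[1])(int, double *) = { worker };
    /* The cast silences -Wincompatible-pointer-types without changing
       the function address that is actually called. */
    return run_threaded((int (*)())table[0], NULL);
  }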
.macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -196,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vadd.f64 d1 , d1, d7 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 @@ -212,11 +212,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 @@ -226,28 +226,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 -d5 } + vldmia.f64 X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 @@ -273,22 +273,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 @@ -300,11 +300,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 @@ -313,28 +313,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
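The kernel/arm/*.S hunks in this and all the remaining files are a mechanical mnemonic rename: the deprecated pre-UAL VFP forms fldmiad/fldmias and fstmiad/fstmias become the unified-syntax (UAL) forms vldmia.f64/vldmia.f32 and vstmia.f64/vstmia.f32. The encodings are identical; the UAL spellings are simply what modern assemblers (notably clang's integrated assembler) accept. A small sketch using GCC-style inline assembly, assuming a 32-bit ARM target with VFP, purely to illustrate the equivalence:

  /* UAL spelling of a post-incrementing VFP load; older sources wrote the
     same instruction as "fldmiad %0!, { d4 - d5 }". Requires an AArch32
     target with VFP; d4/d5 are listed as clobbers. */
  static inline void load_two_doubles(const double **p) {
    __asm__ volatile("vldmia.f64 %0!, { d4 - d5 }"
                     : "+r"(*p)
                     :
                     : "d4", "d5", "memory");
  }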
.macro KERNEL_S1 - fldmias X, { s4 -s5 } + vldmia.f32 X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index c35b8aece..39c9ac233 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -146,17 +146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } fmacd d9 , d0, d5 - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d9 } fmacd d10, d0, d6 - fstmiad Y!, { d10 } + vstmia.f64 Y!, { d10 } fmacd d11, d0, d7 - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d11 } .endm @@ -164,19 +164,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y , { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y!, { d8 } + vstmia.f64 Y!, { d8 } .endm .macro KERNEL_S1 - fldmiad X , { d4 } - fldmiad Y , { d8 } + vldmia.f64 X , { d4 } + vldmia.f64 Y , { d8 } fmacd d8 , d0, d4 - fstmiad Y , { d8 } + vstmia.f64 Y , { d8 } add X, X, INC_X add Y, Y, INC_Y @@ -186,16 +186,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } fmacs s9 , s0, s5 - fstmias Y!, { s9 } + vstmia.f32 Y!, { s9 } fmacs s10, s0, s6 - fstmias Y!, { s10 } + vstmia.f32 Y!, { s10 } fmacs s11, s0, s7 - fstmias Y!, { s11 } + vstmia.f32 Y!, { s11 } .endm @@ -203,19 +203,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y , { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y!, { s8 } + vstmia.f32 Y!, { s8 } .endm .macro KERNEL_S1 - fldmias X , { s4 } - fldmias Y , { s8 } + vldmia.f32 X , { s4 } + vldmia.f32 Y , { s8 } fmacs s8 , s0, s4 - fstmias Y , { s8 } + vstmia.f32 Y , { s8 } add X, X, INC_X add Y, Y, INC_Y @@ -231,42 +231,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } pld [ X, #X_PRE ] - fldmiad X!, { d4 - d7 } + vldmia.f64 X!, { d4 - d7 } pld [ Y, #X_PRE ] - fldmiad Y , { d8 - d11 } + vldmia.f64 Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 - fstmiad Y!, { d10 } - fstmiad Y!, { d11 } + vstmia.f64 Y!, { d10 } + vstmia.f64 Y!, { d11 } @@ -277,15 +277,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y!, { d8 } - fstmiad Y!, { d9 } + vstmia.f64 Y!, { d8 } + vstmia.f64 Y!, { d9 } @@ -293,14 +293,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X , { d4 - d5 } - fldmiad Y , { d8 - d9 } + vldmia.f64 X , { d4 - d5 } + vldmia.f64 Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 - fstmiad Y , { d8 - d9 } + vstmia.f64 Y , { d8 - d9 } add X, X, INC_X add Y, Y, INC_Y @@ -314,40 +314,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmias X!, { s4 - s7 } + vldmia.f32 X!, { s4 - s7 } pld [ Y, #X_PRE ] - fldmias Y , { s8 - s11 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } - fldmias X!, { s4 - s7 } - fldmias Y , { s8 - s11 } + vldmia.f32 X!, { s4 - s7 } + vldmia.f32 Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 - fstmias Y!, { s10 } - fstmias Y!, { s11 } + vstmia.f32 Y!, { s10 } + vstmia.f32 Y!, { s11 } @@ -358,15 +358,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y!, { s8 } - fstmias Y!, { s9 } + vstmia.f32 Y!, { s8 } + vstmia.f32 Y!, { s9 } @@ -374,14 +374,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X , { s4 - s5 } - fldmias Y , { s8 - s9 } + vldmia.f32 X , { s4 - s5 } + vldmia.f32 Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 - fstmias Y , { s8 - s9 } + vstmia.f32 Y , { s8 - s9 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S index 874fcab9c..fbb32b43c 100644 --- a/kernel/arm/ccopy_vfp.S +++ b/kernel/arm/ccopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmias X!, { s0 - s7 } - fstmias Y!, { s0 - s7 } + vldmia.f32 X!, { s0 - s7 } + vstmia.f32 Y!, { s0 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 - s1 } - fstmias Y!, { s0 - s1 } + vldmia.f32 X!, { s0 - s1 } + vstmia.f32 Y!, { s0 - s1 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY_S4 nop - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s2 - s3 } - fstmias Y, { s2 - s3 } + vldmia.f32 X, { s2 - s3 } + vstmia.f32 Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 - s1 } - fstmias Y, { s0 - s1 } + vldmia.f32 X, { s0 - s1 } + vstmia.f32 Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index fd86a37b0..85246d734 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -76,30 +76,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 - fldmias X!, { s6 - s7 } + vldmia.f32 X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 - fldmias Y!, { s10 - s11 } + vldmia.f32 Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } - fldmias Y!, { s8 - s9 } + vldmia.f32 X!, { s4 - s5 } + vldmia.f32 Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -125,8 +125,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -134,8 +134,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -152,8 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s8 - s9 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index 71bc50efd..d2591919e 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -329,9 +329,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } - fldmias CO2, { s4 - s7 } + vldmia.f32 CO2, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -343,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -500,23 +500,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } - fldmias CO2, { s4 - s5 } + vldmia.f32 CO2, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -671,7 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -683,7 +683,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -800,14 +800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 9d473ad78..5ebc904ac 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -182,30 +182,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -218,17 +218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -250,19 +250,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -300,16 +300,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } - fldmias CO2, { s8 - s11 } + vldmia.f32 CO1, { s4 - s7 } + vldmia.f32 CO2, { s8 - s11 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -370,8 +370,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -534,8 +534,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } - fldmias CO2, { s8 - s9 } + vldmia.f32 CO1, { s4 - s5 } + vldmia.f32 CO2, { s8 - s9 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -552,8 +552,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -716,7 +716,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s7 } + vldmia.f32 CO1, { s4 - s7 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -733,7 +733,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -851,7 +851,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias CO1, { s4 - s5 } + vldmia.f32 CO1, { s4 - s5 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 @@ -861,7 +861,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S index 29eeab492..fe4959988 100644 --- a/kernel/arm/cgemm_ncopy_2_vfp.S +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s6 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -111,7 +111,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S index 9036b994d..7b3ae18d4 100644 --- a/kernel/arm/cgemm_tcopy_2_vfp.S +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -73,12 +73,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ .macro COPY2x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -86,12 +86,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x2 - fldmias AO1, { s0 -s1 } + vldmia.f32 AO1, { s0 -s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -100,9 +100,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -110,9 +110,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 62ee33bb9..d6b18c796 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -201,7 +201,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -213,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -266,14 +266,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, #8 @@ -349,47 +349,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -430,14 +430,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0, ALPHA_R flds s1, ALPHA_I - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index c07b6d6f8..6833df7d1 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -150,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -180,7 +180,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -204,8 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO! , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -216,14 +216,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -249,9 +249,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } - fldmias AO2!, { s8 - s9 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } + vldmia.f32 AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -269,25 +269,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y - fldmias YO, { s6 - s7 } + vldmia.f32 YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias YO, { s6 - s7 } + vstmia.f32 YO, { s6 - s7 } add YO, YO, INC_Y @@ -313,8 +313,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 - s3 } - fldmias AO1!, { s4 - s5 } + vldmia.f32 XO , { s2 - s3 } + vldmia.f32 AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 @@ -327,14 +327,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias YO, { s4 - s5 } + vstmia.f32 YO, { s4 - s5 } add YO, YO, INC_Y diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index aae890ea9..ca1a512fb 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -165,9 +165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_I pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmuls s8 , s0, s4 @@ -197,9 +197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } pld [ BO, #B_PRE ] - fldmias BO!, { s4 - s7 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -225,8 +225,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M2 - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -254,8 +254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s7 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 - fstmias CO2, { s4 - s7 } + vstmia.f32 CO2, { s4 - s7 } add CO1, CO1, #16 @@ -513,7 +513,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } flds s4, FP_ZERO vmov.f32 s5, s4 @@ -523,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 - fstmias CO2, { s4 - s5 } + vstmia.f32 CO2, { s4 - s5 } add CO1, CO1, #8 @@ -693,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -818,7 +818,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 79e7ed07f..d75fb7735 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -170,30 +170,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 @@ -206,17 +206,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 @@ -238,19 +238,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 @@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB - fldmias AO!, { s0 - s1 } - fldmias BO!, { s8 - s9 } + vldmia.f32 AO!, { s0 - s1 } + vldmia.f32 BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 @@ -354,8 +354,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 - fstmias CO1, { s4 - s7 } - fstmias CO2, { s8 - s11 } + vstmia.f32 CO1, { s4 - s7 } + vstmia.f32 CO2, { s8 - s11 } add CO1, CO1, #16 @@ -532,8 +532,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 - fstmias CO1, { s4 - s5 } - fstmias CO2, { s8 - s9 } + vstmia.f32 CO1, { s4 - s5 } + vstmia.f32 CO2, { s8 - s9 } add CO1, CO1, #8 @@ -710,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 - fstmias CO1, { s4 - s7 } + vstmia.f32 CO1, { s4 - s7 } add CO1, CO1, #16 @@ -835,7 +835,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 - fstmias CO1, { s4 - s5 } + vstmia.f32 CO1, { s4 - s5 } add CO1, CO1, #8 diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S index da239924a..7ee52af88 100644 --- a/kernel/arm/dcopy_vfp.S +++ b/kernel/arm/dcopy_vfp.S @@ -65,15 +65,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_F4 pld [ X, #X_PRE ] - fldmiad X!, { d0 - d3 } - fstmiad Y!, { d0 - d3 } + vldmia.f64 X!, { d0 - d3 } + vstmia.f64 Y!, { d0 - d3 } .endm .macro COPY_F1 - fldmiad X!, { d0 } - fstmiad Y!, { d0 } + vldmia.f64 X!, { d0 } + vstmia.f64 Y!, { d0 } .endm @@ -83,23 +83,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d1 } - fstmiad Y, { d1 } + vldmia.f64 X, { d1 } + vstmia.f64 Y, { d1 } add X, X, INC_X add Y, Y, INC_Y @@ -108,8 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 } - fstmiad Y, { d0 } + vldmia.f64 X, { d0 } + vstmia.f64 Y, { d0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index cc2e485b7..4dff5a3e1 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -67,26 +67,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X!, { d8 } + vldmia.f64 X!, { d8 } pld [ Y, #X_PRE ] - fldmiad Y!, { d4 } - fldmiad Y!, { d5 } + vldmia.f64 Y!, { d4 } + vldmia.f64 Y!, { d5 } fmacd d0 , d4, d8 - fldmiad X!, { d9 } - fldmiad Y!, { d6 } + vldmia.f64 X!, { d9 } + vldmia.f64 Y!, { d6 } fmacd d1 , d5, d9 - fldmiad X!, { d10 } - fldmiad X!, { d11 } + vldmia.f64 X!, { d10 } + vldmia.f64 X!, { d11 } fmacd d0 , d6, d10 - fldmiad Y!, { d7 } + vldmia.f64 Y!, { d7 } fmacd d1 , d7, d11 .endm .macro KERNEL_F1 - fldmiad X!, { d4 } - fldmiad Y!, { d8 } + vldmia.f64 X!, { d4 } + vldmia.f64 Y!, { d8 } fmacd d0 , d4, d8 .endm @@ -97,26 +97,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 nop - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d4, d8 - fldmiad X, { d5 } - fldmiad Y, { d9 } + vldmia.f64 X, { d5 } + vldmia.f64 Y, { d9 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d5, d9 - fldmiad X, { d6 } - fldmiad Y, { d10 } + vldmia.f64 X, { d6 } + vldmia.f64 Y, { d10 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d6, d10 - fldmiad X, { d7 } - fldmiad Y, { d11 } + vldmia.f64 X, { d7 } + vldmia.f64 Y, { d11 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d7, d11 @@ -126,8 +126,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d8 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d8 } add X, X, INC_X fmacd d0 , d4, d8 add Y, Y, INC_Y diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 1744b54d8..d852c2dad 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -331,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add r4 , CO2, r3 pld [ CO2 , #C_PRE ] - fldmiad CO1, { d8 - d11 } + vldmia.f64 CO1, { d8 - d11 } pld [ r4 , #C_PRE ] fmacd d8 , d0 , d16 @@ -352,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d15, d0 , d23 fstd d11, [CO1, #24 ] - fldmiad r4, { d8 - d11 } + vldmia.f64 r4, { d8 - d11 } fmacd d8 , d0 , d24 fstd d12, [CO2] @@ -367,7 +367,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO2 , #C_PRE ] - fldmiad CO2, { d12 - d15 } + vldmia.f64 CO2, { d12 - d15 } fstd d8 , [r4 ] fmacd d12, d0 , d28 @@ -378,7 +378,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmacd d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S index 6266c61d2..9642b6478 100644 --- a/kernel/arm/dgemm_ncopy_2_vfp.S +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S index ffc19a9cc..5760cbd8a 100644 --- a/kernel/arm/dgemm_ncopy_4_vfp.S +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -105,10 +105,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d11, [ AO4, #16 ] fldd d15, [ AO4, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #32 - fstmiad BO!, { d4 - d7 } - fstmiad BO!, { d8 - d15 } + vstmia.f64 BO!, { d4 - d7 } + vstmia.f64 BO!, { d8 - d15 } .endm @@ -122,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO4, #0 ] add AO3, AO3, #8 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO4, AO4, #8 .endm @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d5 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO2, AO2, #8 .endm @@ -164,7 +164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -174,7 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldd d0 , [ AO1, #0 ] - fstmiad BO!, { d0 } + vstmia.f64 BO!, { d0 } add AO1, AO1, #8 .endm diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S index 937f43957..8335de27c 100644 --- a/kernel/arm/dgemm_tcopy_4_vfp.S +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d8 - d11 } + vldmia.f64 r3, { d8 - d11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d12 - d15 } + vldmia.f64 r3, { d12 - d15 } - fstmiad BO1, { d0 - d15 } + vstmia.f64 BO1, { d0 - d15 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } add r3, r3, LDA - fldmiad r3, { d4 - d5 } + vldmia.f64 r3, { d4 - d5 } add r3, r3, LDA - fldmiad r3, { d6 - d7 } + vldmia.f64 r3, { d6 - d7 } - fstmiad BO2, { d0 - d7 } + vstmia.f64 BO2, { d0 - d7 } add AO1, AO1, #16 add BO2, BO2, #64 @@ -117,18 +117,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x4 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } add r3, r3, LDA - fldmiad r3, { d2 } + vldmia.f64 r3, { d2 } add r3, r3, LDA - fldmiad r3, { d3 } + vldmia.f64 r3, { d3 } - fstmiad BO3, { d0 - d3 } + vstmia.f64 BO3, { d0 - d3 } add AO1, AO1, #8 add BO3, BO3, #32 @@ -139,13 +139,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -153,12 +153,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -166,12 +166,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } add r3, AO1, LDA - fldmiad r3, { d1 } + vldmia.f64 r3, { d1 } - fstmiad BO3, { d0 - d1 } + vstmia.f64 BO3, { d0 - d1 } add AO1, AO1, #8 add BO3, BO3, #16 @@ -182,9 +182,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -192,9 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 @@ -202,9 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY1x1 - fldmiad AO1, { d0 } + vldmia.f64 AO1, { d0 } - fstmiad BO3, { d0 } + vstmia.f64 BO3, { d0 } add AO1, AO1, #8 add BO3, BO3, #8 diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c0c6a1677..e73936cdd 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -128,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmuld d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmuld d17 , d1, d8 fldd d9 , [ BO, #8 ] fmuld d18 , d2, d8 @@ -148,10 +148,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d23 , d3, d9 fmuld d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmuld d25 , d1, d10 fmuld d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -173,10 +173,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] pld [ AO , #A_PRE ] - fldmiad AO!, { d0 - d1} + vldmia.f64 AO!, { d0 - d1} fmacd d16 , d0, d8 - fldmiad AO!, { d2 - d3} + vldmia.f64 AO!, { d2 - d3} fmacd d17 , d1, d8 fldd d9 , [ BO, #8 ] fmacd d18 , d2, d8 @@ -193,10 +193,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d23 , d3, d9 fmacd d24 , d0, d10 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d25 , d1, d10 fmacd d26 , d2, d10 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] @@ -225,11 +225,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [ BO ] fmacd d21 , d5, d13 fmacd d22 , d6, d13 - fldmiad AO!, { d0 - d1 } + vldmia.f64 AO!, { d0 - d1 } fmacd d23 , d7, d13 fmacd d24 , d4, d14 - fldmiad AO!, { d2 - d3 } + vldmia.f64 AO!, { d2 - d3 } fmacd d25 , d5, d14 fldd d9 , [ BO, #8 ] fmacd d26 , d6, d14 @@ -257,10 +257,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacd d19 , d3, d8 fmacd d20 , d0, d9 - fldmiad AO!, { d4 - d5 } + vldmia.f64 AO!, { d4 - d5 } fmacd d21 , d1, d9 fmacd d22 , d2, d9 - fldmiad AO!, { d6 - d7 } + vldmia.f64 AO!, { d6 - d7 } fmacd d23 , d3, d9 fmacd d24 , d0, d10 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [r4 , #24 ] fmuld d15, d0 , d31 - fstmiad CO2, { d12 - d15 } + vstmia.f64 CO2, { d12 - d15 } add CO1, CO1, #32 diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 7c154d741..753ac27c6 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -139,8 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmiad XO! , { d2 } - fldmiad AO1 , { d4 - d7 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d4 - d7 } vmla.f64 d8 , d2 , d4 pld [ AO2 , #4*SIZE ] @@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d11 , d2 , d7 - fldmiad r3 , { d4 - d7 } + vldmia.f64 r3 , { d4 - d7 } vmla.f64 d12 , d2 , d4 vmla.f64 d13 , d2 , d5 @@ -164,23 +164,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F8 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d8 vmla.f64 d5 , d0, d9 vmla.f64 d6 , d0, d10 vmla.f64 d7 , d0, d11 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } vmla.f64 d4 , d0, d12 vmla.f64 d5 , d0, d13 vmla.f64 d6 , d0, d14 vmla.f64 d7 , d0, d15 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -195,8 +195,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -204,9 +204,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 pld [ AO2 , #A_PRE ] - fldmiad XO , { d2 } - fldmiad AO1 , { d8 - d11 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 - d11 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA @@ -249,24 +249,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d12 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d13 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4 , d0, d14 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5 , d0, d15 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -282,8 +282,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1 , { d8 } vmla.f64 d12 , d2 , d8 add AO1, AO1, LDA add XO, XO , INC_X @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d12 - fstmiad YO , { d4 } + vstmia.f64 YO , { d4 } add YO, YO, INC_Y .endm @@ -338,8 +338,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2, #A_PRE ] - fldmias XO! , { s2 } - fldmias AO1 , { s4 - s7 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1 , { s4 - s7 } vmla.f32 s8 , s2 , s4 vmla.f32 s9 , s2 , s5 @@ -348,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r3, AO1, #4*SIZE - fldmias r3 , { s4 - s7 } + vldmia.f32 r3 , { s4 - s7 } vmla.f32 s12 , s2 , s4 vmla.f32 s13 , s2 , s5 @@ -362,24 +362,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s8 vmla.f32 s5 , s0, s9 vmla.f32 s6 , s0, s10 vmla.f32 s7 , s0, s11 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } - fldmias YO, { s4 - s7 } + vldmia.f32 YO, { s4 - s7 } vmla.f32 s4 , s0, s12 vmla.f32 s5 , s0, s13 vmla.f32 s6 , s0, s14 vmla.f32 s7 , s0, s15 - fstmias YO!, { s4 - s7 } + vstmia.f32 YO!, { s4 - s7 } .endm @@ -394,8 +394,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO! 
, { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA @@ -403,9 +403,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 - s11 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 - s11 } vmla.f32 s12 , s2 , s8 vmla.f32 s13 , s2 , s9 @@ -449,24 +449,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S4 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s12 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s13 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4 , s0, s14 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5 , s0, s15 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -482,8 +482,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1 , { s8 } vmla.f32 s12 , s2 , s8 add AO1, AO1, LDA add XO, XO , INC_X @@ -492,9 +492,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s12 - fstmias YO , { s4 } + vstmia.f32 YO , { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 54f958b7b..e80dc1458 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -138,8 +138,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 pld [ AO2 , #A_PRE ] @@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmiad YO, { d16 - d23 } + vldmia.f64 YO, { d16 - d23 } vmla.f64 d16, d0, d24 vmla.f64 d17, d0, d25 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f64 d22, d0, d30 vmla.f64 d23, d0, d31 - fstmiad YO!, { d16 - d23 } + vstmia.f64 YO!, { d16 - d23 } .endm @@ -184,8 +184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO! , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA @@ -193,9 +193,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO!, { d16 } + vstmia.f64 YO!, { d16 } .endm @@ -234,8 +234,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO2 , #A_PRE ] pld [ AO2 , #A_PRE+32 ] - fldmiad XO , { d4 } - fldmiad AO1 , { d8 - d15 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 - d15 } vmla.f64 d24 , d4 , d8 vmla.f64 d25 , d4 , d9 @@ -253,44 +253,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S8 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y - fldmiad YO, { d17 } + vldmia.f64 YO, { d17 } vmla.f64 d17, d0, d25 - fstmiad YO, { d17 } + vstmia.f64 YO, { d17 } add YO, YO, INC_Y - fldmiad YO, { d18 } + vldmia.f64 YO, { d18 } vmla.f64 d18, d0, d26 - fstmiad YO, { d18 } + vstmia.f64 YO, { d18 } add YO, YO, INC_Y - fldmiad YO, { d19 } + vldmia.f64 YO, { d19 } vmla.f64 d19, d0, d27 - fstmiad YO, { d19 } + vstmia.f64 YO, { d19 } add YO, YO, INC_Y - fldmiad YO, { d20 } + vldmia.f64 YO, { d20 } vmla.f64 d20, d0, d28 - fstmiad YO, { d20 } + vstmia.f64 YO, { d20 } add YO, YO, INC_Y - fldmiad YO, { d21 } + vldmia.f64 YO, { d21 } vmla.f64 d21, d0, d29 - fstmiad YO, { d21 } + vstmia.f64 YO, { d21 } add YO, YO, INC_Y - fldmiad YO, { d22 } + vldmia.f64 YO, { d22 } vmla.f64 d22, d0, d30 - fstmiad YO, { d22 } + vstmia.f64 YO, { d22 } add YO, YO, INC_Y - fldmiad YO, { d23 } + vldmia.f64 YO, { d23 } vmla.f64 d23, d0, d31 - fstmiad YO, { d23 } + vstmia.f64 YO, { d23 } add YO, YO, INC_Y .endm @@ -306,8 +306,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d4 } - fldmiad AO1 , { d8 } + vldmia.f64 XO , { d4 } + vldmia.f64 AO1 , { d8 } vmla.f64 d24 , d4 , d8 add AO1, AO1, LDA add XO, XO, INC_X @@ -316,9 +316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d16 } + vldmia.f64 YO, { d16 } vmla.f64 d16, d0, d24 - fstmiad YO, { d16 } + vstmia.f64 YO, { d16 } add YO, YO, INC_Y .endm @@ -361,8 +361,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F8X1 pld [ AO2 , #A_PRE ] - fldmias XO! , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -379,7 +379,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F8 - fldmias YO, { s16 - s23 } + vldmia.f32 YO, { s16 - s23 } vmla.f32 s16, s0, s24 vmla.f32 s17, s0, s25 @@ -390,7 +390,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmla.f32 s22, s0, s30 vmla.f32 s23, s0, s31 - fstmias YO!, { s16 - s23 } + vstmia.f32 YO!, { s16 - s23 } .endm @@ -405,8 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO! , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO!, { s16 } + vstmia.f32 YO!, { s16 } .endm @@ -454,8 +454,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S8X1 pld [ AO2 , #A_PRE ] - fldmias XO , { s4 } - fldmias AO1 , { s8 - s15 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 - s15 } vmla.f32 s24 , s4 , s8 vmla.f32 s25 , s4 , s9 @@ -473,44 +473,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_S8 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y - fldmias YO, { s17 } + vldmia.f32 YO, { s17 } vmla.f32 s17, s0, s25 - fstmias YO, { s17 } + vstmia.f32 YO, { s17 } add YO, YO, INC_Y - fldmias YO, { s18 } + vldmia.f32 YO, { s18 } vmla.f32 s18, s0, s26 - fstmias YO, { s18 } + vstmia.f32 YO, { s18 } add YO, YO, INC_Y - fldmias YO, { s19 } + vldmia.f32 YO, { s19 } vmla.f32 s19, s0, s27 - fstmias YO, { s19 } + vstmia.f32 YO, { s19 } add YO, YO, INC_Y - fldmias YO, { s20 } + vldmia.f32 YO, { s20 } vmla.f32 s20, s0, s28 - fstmias YO, { s20 } + vstmia.f32 YO, { s20 } add YO, YO, INC_Y - fldmias YO, { s21 } + vldmia.f32 YO, { s21 } vmla.f32 s21, s0, s29 - fstmias YO, { s21 } + vstmia.f32 YO, { s21 } add YO, YO, INC_Y - fldmias YO, { s22 } + vldmia.f32 YO, { s22 } vmla.f32 s22, s0, s30 - fstmias YO, { s22 } + vstmia.f32 YO, { s22 } add YO, YO, INC_Y - fldmias YO, { s23 } + vldmia.f32 YO, { s23 } vmla.f32 s23, s0, s31 - fstmias YO, { s23 } + vstmia.f32 YO, { s23 } add YO, YO, INC_Y .endm @@ -526,8 +526,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s4 } - fldmias AO1 , { s8 } + vldmia.f32 XO , { s4 } + vldmia.f32 AO1 , { s8 } vmla.f32 s24 , s4 , s8 add AO1, AO1, LDA add XO, XO, INC_X @@ -536,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s16 } + vldmia.f32 YO, { s16 } vmla.f32 s16, s0, s24 - fstmias YO, { s16 } + vstmia.f32 YO, { s16 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 9559d1829..fbe51cc8c 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -112,13 +112,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO2!, { d4 - d5 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } vmla.f64 d2 , d12 , d8 vmla.f64 d3 , d12 , d4 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO! , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 vmla.f64 d3 , d1 , d4 @@ -143,10 +143,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } vmla.f64 d4, d0, d2 vmla.f64 d5, d0, d3 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -160,10 +160,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d12 - d15 } + vldmia.f64 XO! , { d12 - d15 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d2 , d12 , d8 vmla.f64 d2 , d13 , d9 vmla.f64 d2 , d14, d10 @@ -173,17 +173,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! 
, { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO!, { d4 } + vstmia.f64 YO!, { d4 } .endm @@ -197,23 +197,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d4 - d5 } + vldmia.f64 AO2!, { d4 - d5 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } - fldmiad AO2!, { d6 - d7 } + vldmia.f64 AO1!, { d10 - d11 } + vldmia.f64 AO2!, { d6 - d7 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -229,9 +229,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d4 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d4 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X vmla.f64 d3 , d1 , d4 @@ -240,14 +240,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y - fldmiad YO, { d5 } + vldmia.f64 YO, { d5 } vmla.f64 d5, d0, d3 - fstmiad YO, { d5 } + vstmia.f64 YO, { d5 } add YO, YO, INC_Y .endm @@ -261,20 +261,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmiad XO , { d12 } + vldmia.f64 XO , { d12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } - fldmiad XO , { d13 } + vldmia.f64 XO , { d13 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } - fldmiad XO , { d14 } + vldmia.f64 XO , { d14 } add XO, XO, INC_X - fldmiad XO , { d15 } + vldmia.f64 XO , { d15 } add XO, XO, INC_X vmla.f64 d2 , d12 , d8 @@ -286,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d1 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d1 } + vldmia.f64 AO1!, { d8 } vmla.f64 d2 , d1 , d8 add XO, XO, INC_X @@ -295,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 } + vldmia.f64 YO, { d4 } vmla.f64 d4, d0, d2 - fstmiad YO, { d4 } + vstmia.f64 YO, { d4 } add YO, YO, INC_Y .endm @@ -315,11 +315,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } vmla.f32 s2 , s12 , s8 vmla.f32 s3 , s12 , s4 @@ -334,9 +334,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 vmla.f32 s3 , s1 , s4 @@ -344,10 +344,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE_F2 - fldmias YO, { s4 - s5 } + vldmia.f32 YO, { s4 - s5 } vmla.f32 s4, s0, s2 vmla.f32 s5, s0, s3 - fstmias YO!, { s4 - s5 } + vstmia.f32 YO!, { s4 - s5 } .endm @@ -359,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s12 - s15 } - fldmias AO1!, { s8 - s9 } - fldmias AO1!, { s10 - s11 } + vldmia.f32 XO! , { s12 - s15 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s2 , s12 , s8 vmla.f32 s2 , s13 , s9 vmla.f32 s2 , s14, s10 @@ -371,17 +371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO! , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 .endm .macro SAVE_F1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO!, { s4 } + vstmia.f32 YO!, { s4 } .endm @@ -395,21 +395,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s4 - s5 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s4 - s5 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } - fldmias AO2!, { s6 - s7 } + vldmia.f32 AO1!, { s10 - s11 } + vldmia.f32 AO2!, { s6 - s7 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -425,9 +425,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } - fldmias AO2!, { s4 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s4 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X vmla.f32 s3 , s1 , s4 @@ -436,14 +436,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y - fldmias YO, { s5 } + vldmia.f32 YO, { s5 } vmla.f32 s5, s0, s3 - fstmias YO, { s5 } + vstmia.f32 YO, { s5 } add YO, YO, INC_Y .endm @@ -456,20 +456,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s12 } + vldmia.f32 XO , { s12 } add XO, XO, INC_X pld [ AO1 , #A_PRE ] - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } - fldmias XO , { s13 } + vldmia.f32 XO , { s13 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } - fldmias XO , { s14 } + vldmia.f32 XO , { s14 } add XO, XO, INC_X - fldmias XO , { s15 } + vldmia.f32 XO , { s15 } add XO, XO, INC_X vmla.f32 s2 , s12 , s8 @@ -481,8 +481,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s1 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s1 } + vldmia.f32 AO1!, { s8 } vmla.f32 s2 , s1 , s8 add XO, XO, INC_X @@ -490,9 +490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s4 } + vldmia.f32 YO, { s4 } vmla.f32 s4, s0, s2 - fstmias YO, { s4 } + vstmia.f32 YO, { s4 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index b1d3dadf1..a88d70016 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -108,17 +108,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F2X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -129,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } - fldmiad AO2!, { d16 } + vldmia.f64 XO! , { d2 } + vldmia.f64 AO1!, { d8 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -139,10 +139,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d24 - d25 } + vldmia.f64 YO, { d24 - d25 } vmla.f64 d24, d0, d4 vmla.f64 d25, d0, d5 - fstmiad YO!, { d24 - d25 } + vstmia.f64 YO!, { d24 - d25 } .endm @@ -156,23 +156,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } pld [ AO2 , #A_PRE ] - fldmiad AO2!, { d16 - d17 } + vldmia.f64 AO2!, { d16 - d17 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X vmla.f64 d5 , d28 , d16 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d5 , d29 , d17 - fldmiad AO2!, { d18 - d19 } + vldmia.f64 AO2!, { d18 - d19 } vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d5 , d30, d18 vmla.f64 d4 , d31, d11 @@ -183,10 +183,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X - fldmiad AO2!, { d16 } + vldmia.f64 AO2!, { d16 } vmla.f64 d4 , d2 , d8 vmla.f64 d5 , d2 , d16 @@ -194,14 +194,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d5 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 pld [ XO , #X_PRE ] - fldmiad XO! , { d28 - d31 } + vldmia.f64 XO! , { d28 - d31 } pld [ AO1 , #A_PRE ] - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 vmla.f64 d4 , d30, d10 vmla.f64 d4 , d31, d11 @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO! 
, { d2 } + vldmia.f64 AO1!, { d8 } vmla.f64 d4 , d2 , d8 .endm .macro SAVE_F1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO!, { d24 } + vstmia.f64 YO!, { d24 } .endm @@ -252,18 +252,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 pld [ AO1 , #A_PRE ] - fldmiad XO , { d28 } + vldmia.f64 XO , { d28 } add XO, XO, INC_X - fldmiad AO1!, { d8 - d9 } + vldmia.f64 AO1!, { d8 - d9 } vmla.f64 d4 , d28 , d8 - fldmiad XO , { d29 } + vldmia.f64 XO , { d29 } add XO, XO, INC_X - fldmiad AO1!, { d10 - d11 } + vldmia.f64 AO1!, { d10 - d11 } vmla.f64 d4 , d29 , d9 - fldmiad XO , { d30 } + vldmia.f64 XO , { d30 } add XO, XO, INC_X vmla.f64 d4 , d30, d10 - fldmiad XO , { d31 } + vldmia.f64 XO , { d31 } add XO, XO, INC_X vmla.f64 d4 , d31, d11 @@ -272,8 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 } - fldmiad AO1!, { d8 } + vldmia.f64 XO , { d2 } + vldmia.f64 AO1!, { d8 } add XO, XO, INC_X vmla.f64 d4 , d2 , d8 @@ -281,9 +281,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d24 } + vldmia.f64 YO, { d24 } vmla.f64 d24, d0, d4 - fstmiad YO, { d24 } + vstmia.f64 YO, { d24 } add YO, YO, INC_Y .endm @@ -300,15 +300,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -319,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } - fldmias AO2!, { s16 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1!, { s8 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -329,10 +329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmias YO, { s24 - s25 } + vldmia.f32 YO, { s24 - s25 } vmla.f32 s24, s0, s4 vmla.f32 s25, s0, s5 - fstmias YO!, { s24 - s25 } + vstmia.f32 YO!, { s24 - s25 } .endm @@ -345,22 +345,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S2X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } - fldmias AO2!, { s16 - s17 } + vldmia.f32 AO1!, { s8 - s9 } + vldmia.f32 AO2!, { s16 - s17 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X vmla.f32 s5 , s28 , s16 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s5 , s29 , s17 - fldmias AO2!, { s18 - s19 } + vldmia.f32 AO2!, { s18 - s19 } vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s5 , s30, s18 vmla.f32 s4 , s31, s11 @@ -371,10 +371,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X - fldmias AO2!, { s16 } + vldmia.f32 AO2!, { s16 } vmla.f32 s4 , s2 , s8 vmla.f32 s5 , s2 , s16 @@ -382,14 +382,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s5 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm @@ -402,10 +402,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X4 - fldmias XO! , { s28 - s31 } - fldmias AO1!, { s8 - s9 } + vldmia.f32 XO! , { s28 - s31 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 vmla.f32 s4 , s30, s10 vmla.f32 s4 , s31, s11 @@ -415,17 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmias XO! , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO! , { s2 } + vldmia.f32 AO1!, { s8 } vmla.f32 s4 , s2 , s8 .endm .macro SAVE_F1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO!, { s24 } + vstmia.f32 YO!, { s24 } .endm @@ -437,18 +437,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X4 - fldmias XO , { s28 } + vldmia.f32 XO , { s28 } add XO, XO, INC_X - fldmias AO1!, { s8 - s9 } + vldmia.f32 AO1!, { s8 - s9 } vmla.f32 s4 , s28 , s8 - fldmias XO , { s29 } + vldmia.f32 XO , { s29 } add XO, XO, INC_X - fldmias AO1!, { s10 - s11 } + vldmia.f32 AO1!, { s10 - s11 } vmla.f32 s4 , s29 , s9 - fldmias XO , { s30 } + vldmia.f32 XO , { s30 } add XO, XO, INC_X vmla.f32 s4 , s30, s10 - fldmias XO , { s31 } + vldmia.f32 XO , { s31 } add XO, XO, INC_X vmla.f32 s4 , s31, s11 @@ -457,8 +457,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmias XO , { s2 } - fldmias AO1!, { s8 } + vldmia.f32 XO , { s2 } + vldmia.f32 AO1!, { s8 } add XO, XO, INC_X vmla.f32 s4 , s2 , s8 @@ -466,9 +466,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmias YO, { s24 } + vldmia.f32 YO, { s24 } vmla.f32 s24, s0, s4 - fstmias YO, { s24 } + vstmia.f32 YO, { s24 } add YO, YO, INC_Y .endm diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fab05c9c8..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -114,7 +114,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 } + vldmia.f64 X!, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmiad X, { d0 } + vldmia.f64 X, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z @@ -146,7 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 @@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT_F - fldmias X!, { s0 } + vldmia.f32 X!, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -182,7 +182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 } + vldmia.f32 X, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z @@ -193,7 +193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 @@ -215,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmiad X!, { d0 -d1 } + vldmia.f64 X!, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -227,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -241,7 +241,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmiad X, { d0 -d1 } + vldmia.f64 X, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 @@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F - fldmias X!, { s0 -s1 } + vldmia.f32 X!, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -284,7 +284,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S - fldmias X, { s0 -s1 } + vldmia.f32 X, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 @@ -312,7 +312,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index 16ac5a632..8e0937851 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 84977901d..7be1e977e 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 } + vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -95,7 +95,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -121,7 +121,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmias X!, { s4 } + vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ @@ -158,7 +158,7 @@ KERNEL_F1_NEXT_\@: .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT @@ -191,7 +191,7 @@ KERNEL_S1_NEXT: .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } + vldmia.f64 X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -249,7 +249,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -294,7 +294,7 @@ KERNEL_S1_END_\@: .macro KERNEL_F1 - fldmias X!, { s4 - s5 } + vldmia.f32 X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr @@ -350,7 +350,7 @@ KERNEL_F1_END_\@: .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index ea296dbc5..6aec06205 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -77,68 +77,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } - fldmiad Y, { d5 } + vldmia.f64 X, { d4 } + vldmia.f64 Y, { d5 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 vmls.f64 d3 , d1, d4 - fstmiad X, { d2 } - fstmiad Y, { d3 } + vstmia.f64 X, { d2 } + vstmia.f64 Y, { d3 } add X, X, INC_X add Y, Y, INC_Y @@ -149,68 +149,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s5 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s5 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 vmls.f32 s3 , s1, s4 - fstmias X, { s2 } - fstmias Y, { s3 } + vstmia.f32 X, { s2 } + vstmia.f32 Y, { s3 } add X, X, INC_X add Y, Y, INC_Y @@ -230,96 +230,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 vmls.f64 d3 , d1, d4 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 vmls.f64 d3 , d1, d5 - fstmiad X!, { d2 } - fstmiad Y!, { d3 } + vstmia.f64 X!, { d2 } + vstmia.f64 Y!, { d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d6 - d7 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d6 - d7 } vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 @@ -347,96 +347,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 vmls.f32 s3 , s1, s4 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 vmls.f32 s3 , s1, s5 - fstmias X!, { s2 } - fstmias Y!, { s3 } + vstmia.f32 X!, { s2 } + vstmia.f32 Y!, { s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } - fldmias Y, { s6 - s7 } + vldmia.f32 X, { s4 - s5 } + vldmia.f32 Y, { s6 - s7 } vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index cc3e3b98d..8992c35a8 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -64,30 +64,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 pld [ X, #X_PRE ] - fldmiad X, { d4 - d7 } + vldmia.f64 X, { d4 - d7 } vmul.f64 d4, d4, d0 vmul.f64 d5, d5, d0 vmul.f64 d6, d6, d0 - fstmiad X!, { d4 - d5 } + vstmia.f64 X!, { d4 - d5 } vmul.f64 d7, d7, d0 - fstmiad X!, { d6 - d7 } + vstmia.f64 X!, { d6 - d7 } .endm .macro KERNEL_F1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X!, { d4 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d4 } + vldmia.f64 X, { d4 } vmul.f64 d4, d4, d0 - fstmiad X, { d4 } + vstmia.f64 X, { d4 } add X, X, INC_X .endm @@ -96,30 +96,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 - fldmias X, { s4 - s7 } + vldmia.f32 X, { s4 - s7 } vmul.f32 s4, s4, s0 vmul.f32 s5, s5, s0 vmul.f32 s6, s6, s0 - fstmias X!, { s4 - s5 } + vstmia.f32 X!, { s4 - s5 } vmul.f32 s7, s7, s0 - fstmias X!, { s6 - s7 } + vstmia.f32 X!, { s6 - s7 } .endm .macro KERNEL_F1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X!, { s4 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s4 } + vldmia.f32 X, { s4 } vmul.f32 s4, s4, s0 - fstmias X, { s4 } + vstmia.f32 X, { s4 } add X, X, INC_X .endm @@ -136,58 +136,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } pld [ X, #X_PRE ] - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_F1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X!, { d2 - d3 } + vstmia.f64 X!, { d2 - d3 } .endm .macro KERNEL_S1 - fldmiad X, { d4 - d5 } + vldmia.f64 X, { d4 - d5 } vmul.f64 d2, d0, d4 vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 - fstmiad X, { d2 - d3 } + vstmia.f64 X, { d2 - d3 } add X, X, INC_X .endm @@ -199,56 +199,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_F1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X!, { s2 - s3 } + vstmia.f32 X!, { s2 - s3 } .endm .macro KERNEL_S1 - fldmias X, { s4 - s5 } + vldmia.f32 X, { s4 - s5 } vmul.f32 s2, s0, s4 vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 - fstmias X, { s2 - s3 } + vstmia.f32 X, { s2 - s3 } add X, X, INC_X .endm diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S index 0fd815db8..1ccd29c95 100644 --- a/kernel/arm/scopy_vfp.S +++ b/kernel/arm/scopy_vfp.S @@ -65,17 +65,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY_F8 pld [ X, #X_PRE ] - fldmias X!, { s0 - s3 } - fldmias X!, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias Y!, { s4 - s7 } + vldmia.f32 X!, { s0 - s3 } + vldmia.f32 X!, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 Y!, { s4 - s7 } .endm .macro COPY_F1 - fldmias X!, { s0 } - fstmias Y!, { s0 } + vldmia.f32 X!, { s0 } + vstmia.f32 Y!, { s0 } .endm @@ -85,23 +85,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s1 } - fstmias Y, { s1 } + vldmia.f32 X, { s1 } + vstmia.f32 Y, { s1 } add X, X, INC_X add Y, Y, INC_Y @@ -110,8 +110,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmias X, { s0 } - fstmias Y, { s0 } + vldmia.f32 X, { s0 } + vstmia.f32 Y, { s0 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 544846258..bb374b5ee 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -68,26 +68,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -96,8 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s14 } - fldmias Y!, { s15 } + vldmia.f32 X!, { s14 } + vldmia.f32 Y!, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -109,32 +109,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 add X, X, INC_X add Y, Y, INC_Y - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -146,8 +146,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s14 } - fldmias Y, { s15 } + vldmia.f32 X, { s14 } + vldmia.f32 Y, { s15 } vmul.f32 s15, s14, s15 vcvt.f64.f32 d4, s15 vadd.f64 d0 , d0, d4 @@ -162,12 +162,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
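Reviewer note on the sdot_vfp.S hunks above: this first group of kernels multiplies in single precision but accumulates in double, which looks like the dsdot-style path (the preprocessor guard is not visible in this diff); that is why each load pair is followed by a convert-and-add rather than a fused multiply-accumulate. The core step, repeated four times per KERNEL_F4:

    vldmia.f32   X!, { s14 }       @ x[i]
    vldmia.f32   Y!, { s15 }       @ y[i]
    vmul.f32     s15, s14, s15     @ single-precision product x[i]*y[i]
    vcvt.f64.f32 d4, s15           @ widen the product to f64
    vadd.f64     d0, d0, d4        @ accumulate the dot product in f64

The kernels that follow (fmacs into s0/s1) are the ordinary single-precision accumulation path.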
.macro KERNEL_F4 - fldmias X!, { s8 - s9 } - fldmias Y!, { s4 - s5} + vldmia.f32 X!, { s8 - s9 } + vldmia.f32 Y!, { s4 - s5} fmacs s0 , s4, s8 - fldmias X!, { s10 - s11 } + vldmia.f32 X!, { s10 - s11 } fmacs s1 , s5, s9 - fldmias Y!, { s6 - s7 } + vldmia.f32 Y!, { s6 - s7 } fmacs s0 , s6, s10 fmacs s1 , s7, s11 @@ -175,8 +175,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmias X!, { s4 } - fldmias Y!, { s8 } + vldmia.f32 X!, { s4 } + vldmia.f32 Y!, { s8 } fmacs s0 , s4, s8 .endm @@ -185,26 +185,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S4 nop - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s4, s8 - fldmias X, { s5 } - fldmias Y, { s9 } + vldmia.f32 X, { s5 } + vldmia.f32 Y, { s9 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s5, s9 - fldmias X, { s6 } - fldmias Y, { s10 } + vldmia.f32 X, { s6 } + vldmia.f32 Y, { s10 } add X, X, INC_X add Y, Y, INC_Y fmacs s0 , s6, s10 - fldmias X, { s7 } - fldmias Y, { s11 } + vldmia.f32 X, { s7 } + vldmia.f32 Y, { s11 } add X, X, INC_X add Y, Y, INC_Y fmacs s1 , s7, s11 @@ -214,8 +214,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmias X, { s4 } - fldmias Y, { s8 } + vldmia.f32 X, { s4 } + vldmia.f32 Y, { s8 } add X, X, INC_X fmacs s0 , s4, s8 add Y, Y, INC_Y diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 1f21e5a1f..c072f4126 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -112,8 +112,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO! , { s0 - s3 } - fldmias BO! , { s4 - s5 } + vldmia.f32 AO! , { s0 - s3 } + vldmia.f32 BO! , { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 6491d3571..789643f56 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -136,29 +136,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I pld [ AO , #A_PRE ] - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ BO , #B_PRE ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -174,20 +174,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s3 } + vldmia.f32 AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias BO!, { s8 - s11 } + vldmia.f32 BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - //fldmias AO!, { s2 - s3 } + //vldmia.f32 AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - //fldmias BO!, { s10 - s11 } + //vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -203,17 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s7 } + vldmia.f32 AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias BO!, { s12 - s15 } - //fldmias AO!, { s6 - s7 } + vldmia.f32 BO!, { s12 - s15 } + //vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 fmacs s21 , s1, s9 fmacs s22 , s2, s9 - //fldmias BO!, { s14 - s15 } + //vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -300,7 +300,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA add r4 , CO2, r3 - fldmias CO1, { s8 - s11 } + vldmia.f32 CO1, { s8 - s11 } fmacs s8 , s0 , s16 flds s12, [CO2] @@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO1 , #C_PRE ] - fldmias r4, { s8 - s11 } + vldmia.f32 r4, { s8 - s11 } fmacs s8 , s0 , s24 fsts s12, [CO2] @@ -338,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add CO2, r4 , r3 - fldmias CO2, { s12 - s15 } + vldmia.f32 CO2, { s12 - s15 } fsts s8 , [r4 ] fmacs s12, s0 , s28 @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s15, s0 , s31 pld [ r4 , #C_PRE ] - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } pld [ CO2 , #C_PRE ] add CO1, CO1, #16 diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S index ff4ff0845..dd4596602 100644 --- a/kernel/arm/sgemm_ncopy_2_vfp.S +++ b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -73,7 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s3 , [ AO2, #4 ] add AO1, AO1, #8 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO2, AO2, #8 .endm @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO1, AO1, #8 .endm @@ -105,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S index ab013134e..dbcea5961 100644 --- a/kernel/arm/sgemm_ncopy_4_vfp.S +++ b/kernel/arm/sgemm_ncopy_4_vfp.S @@ -100,10 +100,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s11, [ AO4, #8 ] flds s15, [ AO4, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #16 - fstmias BO!, { s4 - s7 } - fstmias BO!, { s8 - s15 } + vstmia.f32 BO!, { s4 - s7 } + vstmia.f32 BO!, { s8 - s15 } .endm @@ -117,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s3 , [ AO4, #0 ] add AO3, AO3, #4 - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO4, AO4, #4 .endm @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s5 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] - fstmias BO!, { s0 - s7 } + vstmia.f32 BO!, { s0 - s7 } add AO2, AO2, #16 .endm @@ -147,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s1 , [ AO2, #0 ] add AO1, AO1, #4 - fstmias BO!, { s0 - s1 } + vstmia.f32 BO!, { s0 - s1 } add AO2, AO2, #4 .endm @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] - fstmias BO!, { s0 - s3 } + vstmia.f32 BO!, { s0 - s3 } add AO1, AO1, #16 .endm @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0 , [ AO1, #0 ] - fstmias BO!, { s0 } + vstmia.f32 BO!, { s0 } add AO1, AO1, #4 .endm diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S index 9bb0e46b1..e61613c5c 100644 --- a/kernel/arm/sgemm_tcopy_4_vfp.S +++ b/kernel/arm/sgemm_tcopy_4_vfp.S @@ -76,21 +76,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_1 pld [ AO1, #A_PRE ] - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA pld [ r3, #A_PRE ] - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -98,18 +98,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4_2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } add r3, r3, LDA - fldmias r3, { s8 - s11 } + vldmia.f32 r3, { s8 - s11 } add r3, r3, LDA - fldmias r3, { s12 - s15 } + vldmia.f32 r3, { s12 - s15 } - fstmias BO1, { s0 - s15 } + vstmia.f32 BO1, { s0 - s15 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -118,18 +118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } add r3, r3, LDA - fldmias r3, { s4 - s5 } + vldmia.f32 r3, { s4 - s5 } add r3, r3, LDA - fldmias r3, { s6 - s7 } + vldmia.f32 r3, { s6 - s7 } - fstmias BO2, { s0 - s7 } + vstmia.f32 BO2, { s0 - s7 } add AO1, AO1, #8 add BO2, BO2, #32 @@ -137,18 +137,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x4 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } add r3, r3, LDA - fldmias r3, { s2 } + vldmia.f32 r3, { s2 } add r3, r3, LDA - fldmias r3, { s3 } + vldmia.f32 r3, { s3 } - fstmias BO3, { s0 - s3 } + vstmia.f32 BO3, { s0 - s3 } add AO1, AO1, #4 add BO3, BO3, #16 @@ -158,12 +158,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } add r3, AO1, LDA - fldmias r3, { s4 - s7 } + vldmia.f32 r3, { s4 - s7 } - fstmias BO1, { s0 - s7 } + vstmia.f32 BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -171,12 +171,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY2x2 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } add r3, AO1, LDA - fldmias r3, { s2 - s3 } + vldmia.f32 r3, { s2 - s3 } - fstmias BO2, { s0 - s3 } + vstmia.f32 BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 @@ -184,12 +184,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } add r3, AO1, LDA - fldmias r3, { s1 } + vldmia.f32 r3, { s1 } - fstmias BO3, { s0 - s1 } + vstmia.f32 BO3, { s0 - s1 } add AO1, AO1, #4 add BO3, BO3, #8 @@ -199,9 +199,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x1 - fldmias AO1, { s0 - s3 } + vldmia.f32 AO1, { s0 - s3 } - fstmias BO1, { s0 - s3 } + vstmia.f32 BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 @@ -209,9 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x1 - fldmias AO1, { s0 - s1 } + vldmia.f32 AO1, { s0 - s1 } - fstmias BO2, { s0 - s1 } + vstmia.f32 BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 @@ -219,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmias AO1, { s0 } + vldmia.f32 AO1, { s0 } - fstmias BO3, { s0 } + vstmia.f32 BO3, { s0 } add AO1, AO1, #4 add BO3, BO3, #4 diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 635b1dd13..34fa0ee39 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -118,8 +118,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB - fldmias AO!, { s0 - s3 } - fldmias BO!, { s4 - s5 } + vldmia.f32 AO!, { s0 - s3 } + vldmia.f32 BO!, { s4 - s5 } fmacs s8 , s0, s4 fmacs s9 , s1, s4 diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index e24d24eba..0f601d5b8 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -122,30 +122,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } pld [ AO , #A_PRE-8 ] - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } pld [ BO , #B_PRE-8 ] fmuls s16 , s0, s8 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 @@ -161,20 +161,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 - fldmias AO!, { s0 - s1 } + vldmia.f32 AO!, { s0 - s1 } fmacs s18 , s6, s12 pld [ BO , #B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 - fldmias AO!, { s2 - s3 } + vldmia.f32 AO!, { s2 - s3 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 - fldmias BO!, { s8 - s9 } + vldmia.f32 BO!, { s8 - s9 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 - fldmias BO!, { s10 - s11 } + vldmia.f32 BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 @@ -190,17 +190,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
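The _I/_M1/_M2 macros around here form a software pipeline: KERNEL4x4_I primes one register set with fmuls, then M1 (below) and M2 alternate, each issuing the loads for the next k-step into the idle register set while the fmacs for the current step execute, hiding load latency behind the arithmetic. A self-contained C sketch of the same two-deep ping-pong applied to a scalar dot product (all names illustrative):

    static float dot_pipelined(const float *x, const float *y, int k)
    {
        float acc = 0.0f;
        float a0, b0;
        int i;
        if (k <= 0)
            return acc;
        a0 = x[0]; b0 = y[0];                  /* _I: prime the pipeline      */
        for (i = 1; i + 1 < k; i += 2) {
            float a1 = x[i], b1 = y[i];        /* M1: load next, use current  */
            acc += a0 * b0;
            a0 = x[i + 1]; b0 = y[i + 1];      /* M2: load next, use previous */
            acc += a1 * b1;
        }
        for (; i < k; i++)                     /* scalar tail                 */
            acc += x[i] * y[i];
        acc += a0 * b0;                        /* retire the pending pair     */
        return acc;
    }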
.macro KERNEL4x4_M1 fmacs s16 , s0, s8 - fldmias AO!, { s4 - s5 } + vldmia.f32 AO!, { s4 - s5 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 - fldmias AO!, { s6 - s7 } + vldmia.f32 AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 - fldmias BO!, { s12 - s13 } + vldmia.f32 BO!, { s12 - s13 } fmacs s21 , s1, s9 fmacs s22 , s2, s9 - fldmias BO!, { s14 - s15 } + vldmia.f32 BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 @@ -325,7 +325,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [r4 , #12 ] fmuls s15, s0 , s31 - fstmias CO2, { s12 - s15 } + vstmia.f32 CO2, { s12 - s15 } add CO1, CO1, #16 diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 76661da79..0b3d98912 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -103,29 +103,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y!, { d0 } - fstmiad X!, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y!, { d0 } + vstmia.f64 X!, { d4 } .endm .macro KERNEL_S1 - fldmiad X, { d0 } - fldmiad Y, { d4 } - fstmiad Y, { d0 } - fstmiad X, { d4 } + vldmia.f64 X, { d0 } + vldmia.f64 Y, { d4 } + vstmia.f64 Y, { d0 } + vstmia.f64 X, { d4 } add X, X, INC_X add Y, Y, INC_Y @@ -135,29 +135,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y!, { s0 } - fstmias X!, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y!, { s0 } + vstmia.f32 X!, { s4 } .endm .macro KERNEL_S1 - fldmias X, { s0 } - fldmias Y, { s4 } - fstmias Y, { s0 } - fstmias X, { s4 } + vldmia.f32 X, { s0 } + vldmia.f32 Y, { s4 } + vstmia.f32 Y, { s0 } + vstmia.f32 X, { s4 } add X, X, INC_X add Y, Y, INC_Y @@ -174,35 +174,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X, { d0 - d3 } - fldmiad Y, { d4 - d7 } - fstmiad Y!, { d0 - d3 } - fstmiad X!, { d4 - d7} + vldmia.f64 X, { d0 - d3 } + vldmia.f64 Y, { d4 - d7 } + vstmia.f64 Y!, { d0 - d3 } + vstmia.f64 X!, { d4 - d7} .endm .macro KERNEL_F1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y!, { d0 - d1 } - fstmiad X!, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y!, { d0 - d1 } + vstmia.f64 X!, { d4 - d5 } .endm .macro KERNEL_S1 - fldmiad X, { d0 - d1 } - fldmiad Y, { d4 - d5 } - fstmiad Y, { d0 - d1 } - fstmiad X, { d4 - d5 } + vldmia.f64 X, { d0 - d1 } + vldmia.f64 Y, { d4 - d5 } + vstmia.f64 Y, { d0 - d1 } + vstmia.f64 X, { d4 - d5 } add X, X, INC_X add Y, Y, INC_Y @@ -215,33 +215,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
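The swap kernels that follow exchange the contents of x and y block by block; note that the loads use plain addressing while the crossed stores carry the "!" writeback, so each base pointer advances exactly once per block. Stripped of the unrolling, the whole routine is this C loop (unit-stride sketch, illustrative name):

    static void sswap_unit(int n, float *x, float *y)
    {
        for (int i = 0; i < n; i++) {
            float t = x[i];
            x[i] = y[i];
            y[i] = t;
        }
    }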
pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} - fldmias X, { s0 - s3 } - fldmias Y, { s4 - s7 } - fstmias Y!, { s0 - s3 } - fstmias X!, { s4 - s7} + vldmia.f32 X, { s0 - s3 } + vldmia.f32 Y, { s4 - s7 } + vstmia.f32 Y!, { s0 - s3 } + vstmia.f32 X!, { s4 - s7} .endm .macro KERNEL_F1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y!, { s0 - s1 } - fstmias X!, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y!, { s0 - s1 } + vstmia.f32 X!, { s4 - s5 } .endm .macro KERNEL_S1 - fldmias X, { s0 - s1 } - fldmias Y, { s4 - s5 } - fstmias Y, { s0 - s1 } - fstmias X, { s4 - s5 } + vldmia.f32 X, { s0 - s1 } + vldmia.f32 Y, { s4 - s5 } + vstmia.f32 Y, { s0 - s1 } + vstmia.f32 X, { s4 - s5 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S index 48aee4ce0..899dd1e36 100644 --- a/kernel/arm/zcopy_vfp.S +++ b/kernel/arm/zcopy_vfp.S @@ -66,15 +66,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ X, #X_PRE+32 ] - fldmiad X!, { d0 - d7 } - fstmiad Y!, { d0 - d7 } + vldmia.f64 X!, { d0 - d7 } + vstmia.f64 Y!, { d0 - d7 } .endm .macro COPY_F1 - fldmiad X!, { d0 - d1 } - fstmiad Y!, { d0 - d1 } + vldmia.f64 X!, { d0 - d1 } + vstmia.f64 Y!, { d0 - d1 } .endm @@ -84,23 +84,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S4 nop - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d2 - d3 } - fstmiad Y, { d2 - d3 } + vldmia.f64 X, { d2 - d3 } + vstmia.f64 Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_S1 - fldmiad X, { d0 - d1 } - fstmiad Y, { d0 - d1 } + vldmia.f64 X, { d0 - d1 } + vstmia.f64 Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index c0cd92d3c..5ef9f16a9 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -76,15 +76,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ X, #X_PRE ] pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 pld [ X, #X_PRE ] @@ -93,15 +93,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
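In this zdot kernel, d0 through d3 hold four running partial sums, re*re, re*im, im*im and im*re, as the fmacd lines show (d0 += d4*d8, d1 += d4*d9, d2 += d5*d9, d3 += d5*d8, with d4/d5 the real/imaginary parts of x and d8/d9 those of y). The final combine outside the hunks shown here then forms the unconjugated or conjugated result from the same four sums. A plain C sketch of the accumulation (illustrative names):

    /* Accumulate the four partials the kernel keeps in d0..d3; x and y are
     * n complex doubles stored as interleaved re/im pairs. */
    static void zdot_partials(const double *x, const double *y, int n,
                              double s[4])
    {
        s[0] = s[1] = s[2] = s[3] = 0.0;
        for (int i = 0; i < n; i++) {
            double xr = x[2 * i], xi = x[2 * i + 1];
            double yr = y[2 * i], yi = y[2 * i + 1];
            s[0] += xr * yr;    /* d0 */
            s[1] += xr * yi;    /* d1 */
            s[2] += xi * yi;    /* d2 */
            s[3] += xi * yr;    /* d3 */
        }
        /* dotu = (s[0] - s[2]) + i*(s[1] + s[3]);
           dotc = (s[0] + s[2]) + i*(s[1] - s[3]); */
    }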
pld [ Y, #X_PRE ] - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 - fldmiad X!, { d6 - d7 } + vldmia.f64 X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 - fldmiad Y!, { d10 - d11 } + vldmia.f64 Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 fmacd d2 , d7, d11 @@ -111,8 +111,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1 - fldmiad X!, { d4 - d5 } - fldmiad Y!, { d8 - d9 } + vldmia.f64 X!, { d4 - d5 } + vldmia.f64 Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -127,8 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. nop - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -145,8 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, INC_X add Y, Y, INC_Y - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 @@ -168,8 +168,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1 - fldmiad X, { d4 - d5 } - fldmiad Y, { d8 - d9 } + vldmia.f64 X, { d4 - d5 } + vldmia.f64 Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 53d18b07b..7934a500e 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -360,7 +360,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -372,9 +372,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } - fldmiad CO2, { d4 - d7 } + vldmia.f64 CO2, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -386,7 +386,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -543,23 +543,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
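The SAVE blocks in the complex kernels from here on all apply the same update, C += alpha * T, with alpha split into ALPHA_R/ALPHA_I and the FMAC_R1/FMAC_I1/FMAC_R2/FMAC_I2 macros supplying the sign pattern of the complex multiply (the R2/I2 signs flip in the conjugated variants). For the plain case the per-element arithmetic reduces to this (illustrative C, c points at one re/im pair):

    static void zsave1(double *c, double ar, double ai, double tr, double ti)
    {
        c[0] += ar * tr;    /* FMAC_R1 */
        c[1] += ar * ti;    /* FMAC_I1 */
        c[0] -= ai * ti;    /* FMAC_R2 (sign flips when conjugating) */
        c[1] += ai * tr;    /* FMAC_I2 */
    }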
fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } - fldmiad CO2, { d4 - d5 } + vldmia.f64 CO2, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -714,7 +714,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -726,7 +726,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -843,14 +843,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index a9d4eddeb..cbb10f342 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -374,8 +374,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } - fldmiad CO2, { d8 - d11 } + vldmia.f64 CO1, { d4 - d7 } + vldmia.f64 CO2, { d8 - d11 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -406,8 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -570,8 +570,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } - fldmiad CO2, { d8 - d9 } + vldmia.f64 CO1, { d4 - d5 } + vldmia.f64 CO2, { d8 - d9 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -588,8 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -752,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d7 } + vldmia.f64 CO1, { d4 - d7 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -769,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -887,7 +887,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad CO1, { d4 - d5 } + vldmia.f64 CO1, { d4 - d5 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 @@ -897,7 +897,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S index b3fa225bb..d0661da2a 100644 --- a/kernel/arm/zgemm_ncopy_2_vfp.S +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -87,7 +87,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d6 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] - fstmiad BO!, { d0 - d7 } + vstmia.f64 BO!, { d0 - d7 } add AO2, AO2, #32 .endm @@ -101,7 +101,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO2, AO2, #16 .endm @@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] - fstmiad BO!, { d0 - d3 } + vstmia.f64 BO!, { d0 - d3 } add AO1, AO1, #32 .endm @@ -124,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] - fstmiad BO!, { d0 - d1 } + vstmia.f64 BO!, { d0 - d1 } add AO1, AO1, #16 .endm diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S b/kernel/arm/zgemm_tcopy_2_vfp.S index 7e27ca6a6..5e1a384b1 100644 --- a/kernel/arm/zgemm_tcopy_2_vfp.S +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -74,13 +74,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x2 pld [ AO1, #A_PRE ] - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] - fldmiad r3, { d4 - d7 } + vldmia.f64 r3, { d4 - d7 } - fstmiad BO1, { d0 - d7 } + vstmia.f64 BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -88,12 +88,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x2 - fldmiad AO1, { d0 -d1 } + vldmia.f64 AO1, { d0 -d1 } add r3, AO1, LDA - fldmiad r3, { d2 - d3 } + vldmia.f64 r3, { d2 - d3 } - fstmiad BO2, { d0 - d3 } + vstmia.f64 BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 @@ -102,9 +102,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*************************************************************************************************************************/ .macro COPY2x1 - fldmiad AO1, { d0 - d3 } + vldmia.f64 AO1, { d0 - d3 } - fstmiad BO1, { d0 - d3 } + vstmia.f64 BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 @@ -112,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY1x1 - fldmiad AO1, { d0 - d1 } + vldmia.f64 AO1, { d0 - d1 } - fstmiad BO2, { d0 - d1 } + vstmia.f64 BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index 3e3a1bc07..4e64d8785 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -204,7 +204,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -216,9 +216,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -230,7 +230,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -269,14 +269,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, #16 @@ -352,47 +352,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 2193083af..c66fa4fb8 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -151,12 +151,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F2X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 - fldmiad AO2!, { d8 - d9 } + vldmia.f64 AO2!, { d8 - d9 } KMAC_R d12 , d5 , d3 KMAC_I d13 , d5 , d2 @@ -169,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F2 - fldmiad YO, { d4 - d7 } + vldmia.f64 YO, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO!, { d4 - d7 } + vstmia.f64 YO!, { d4 - d7 } .endm @@ -205,8 +205,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F1X1 - fldmiad XO! , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO! , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -217,14 +217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_F1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO!, { d4 - d5 } + vstmia.f64 YO!, { d4 - d5 } .endm @@ -250,9 +250,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_S2X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } - fldmiad AO2!, { d8 - d9 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } + vldmia.f64 AO2!, { d8 - d9 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -270,25 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S2 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y - fldmiad YO, { d6 - d7 } + vldmia.f64 YO, { d6 - d7 } FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad YO, { d6 - d7 } + vstmia.f64 YO, { d6 - d7 } add YO, YO, INC_Y @@ -314,8 +314,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_S1X1 - fldmiad XO , { d2 - d3 } - fldmiad AO1!, { d4 - d5 } + vldmia.f64 XO , { d2 - d3 } + vldmia.f64 AO1!, { d4 - d5 } fmacd d12 , d4 , d2 fmacd d13 , d4 , d3 @@ -328,14 +328,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE_S1 - fldmiad YO, { d4 - d5 } + vldmia.f64 YO, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad YO, { d4 - d5 } + vstmia.f64 YO, { d4 - d5 } add YO, YO, INC_Y diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index cb6bc050e..4393bc9f6 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -385,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 - fstmiad CO2, { d4 - d7 } + vstmia.f64 CO2, { d4 - d7 } add CO1, CO1, #32 @@ -567,7 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 @@ -577,7 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 - fstmiad CO2, { d4 - d5 } + vstmia.f64 CO2, { d4 - d5 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 3e6962f06..39b12caa0 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -391,8 +391,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 - fstmiad CO1, { d4 - d7 } - fstmiad CO2, { d8 - d11 } + vstmia.f64 CO1, { d4 - d7 } + vstmia.f64 CO2, { d8 - d11 } add CO1, CO1, #32 @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 - fstmiad CO1, { d4 - d5 } - fstmiad CO2, { d8 - d9 } + vstmia.f64 CO1, { d4 - d5 } + vstmia.f64 CO2, { d8 - d9 } add CO1, CO1, #16 @@ -747,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 - fstmiad CO1, { d4 - d7 } + vstmia.f64 CO1, { d4 - d7 } add CO1, CO1, #32 @@ -872,7 +872,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 - fstmiad CO1, { d4 - d5 } + vstmia.f64 CO1, { d4 - d5 } add CO1, CO1, #16 diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 5904bc580..765e5ebbb 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -270,6 +270,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .L20: + beqz INCY, .L27 dsra I, N, 3 move YY, Y @@ -450,5 +451,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index f54008bc2..23225770a 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -562,6 +562,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //INCX!=1 or INCY != 1 .L20: + beq INCY, $0, .L27 dsra I, N, 3 move YY, Y @@ -754,5 +755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. j $31 NOP + .align 3 +.L27: + LD b1, 0 * SIZE(Y) +.L28: + daddiu N, N, -1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + bgtz N, .L28 + MADD b1, b1, ALPHA, a1 + + j .L999 + ST b1, 0 * SIZE(Y) + EPILOGUE diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1256f4c3c..ba149512d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,18 +2,12 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S +DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -#DTRMMKERNEL = ../generic/trmmkernel_16x2.c -#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -#DGEMMINCOPY = ../generic/gemm_ncopy_16.c -#DGEMMITCOPY = ../generic/gemm_tcopy_16.c -#DGEMMONCOPY = ../generic/gemm_ncopy_2.c -#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c new file mode 100644 index 000000000..293bd4a99 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex.c @@ -0,0 +1,1642 @@ +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/* + * This file is based on dgemm_kernel_4x8_haswell.s (original copyright above). + * The content was translated from ASM to C+intrinsics, significantly simplified, + * and AVX512 support was added by Arjan van de Ven + */ + + +#include "common.h" +#include <immintrin.h> + + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + + +/******************************************************************************************/ + + +#define INIT4x8() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + ymm8 = _mm256_setzero_pd(); \ + ymm9 = _mm256_setzero_pd(); \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + + +#define KERNEL4x8_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ +/* ymm0 [ A B C D ] */ \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + ymm2 = _mm256_loadu_pd(BO - 8); \ +/* ymm1 [ 1 2 3 4 ] */ \ +/* ymm2 [ 5 6 7 8 ] */ \ + \ + ymm4 += ymm0 * ymm1; \ +/* ymm4 += [ A*1 | B*2 | C*3 | D*4 ] */ \ + ymm8 += ymm0 * ymm2; \ +/* ymm8 += [ A*5 | B*6 | C*7 | D*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ B A D C ] */ \ + ymm5 += ymm0 * ymm1; \ +/* ymm5 += [ B*1 | A*2 | D*3 | C*4 ] */ \ + ymm9 += ymm0 * ymm2; \ +/* ymm9 += [ B*5 | A*6 | D*7 | C*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ +/* ymm0 [ C D A B ] */ \ + ymm6 += ymm0 * ymm1; \ +/* ymm6 += [ C*1 | D*2 | A*3 | B*4 ] */ \ + ymm10+= ymm0 * ymm2; \ +/* ymm10 += [ C*5 | D*6 | A*7 | B*8 ] */ \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ +/* ymm0 [ D C B A ] */ \ + ymm7 += ymm0 * ymm1; \ +/* ymm7 += [ D*1 | C*2 | B*3 | A*4 ] */ \ + ymm11+= ymm0 * ymm2; \ +/* ymm11 += [ D*5 | C*6 | B*7 | A*8 ] */ \ + AO += 4; \ + BO += 8; + + +#define SAVE4x8(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + ymm8 *= ymm0; \ + ymm9 *= ymm0; \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + \ +/* Entry values: */ \ +/* ymm4 = a [ A*1 | B*2 | C*3 | D*4 ] */ \ +/* ymm5 = a [
B*1 | A*2 | D*3 | C*4 ] */ \ +/* ymm6 = a [ C*1 | D*2 | A*3 | B*4 ] */ \ +/* ymm7 = a [ D*1 | C*2 | B*3 | A*4 ] */ \ +/* ymm8 = a [ A*5 | B*6 | C*7 | D*8 ] */ \ +/* ymm9 = a [ B*5 | A*6 | D*7 | C*8 ] */ \ +/* ymm10 = a [ C*5 | D*6 | A*7 | B*8 ] */ \ +/* ymm11 = a [ D*5 | C*6 | B*7 | A*8 ] */ \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ +/* ymm5 = a [ A*2 | B*1 | C*4 | D*3 ] */ \ + ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \ +/* ymm7 = a [ C*2 | D*1 | A*4 | B*3 ] */ \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ +/* ymm0 = a [ A*1 | B*1 | C*3 | D*3 ] */ \ +/* ymm1 = a [ A*2 | B*2 | C*4 | D*4 ] */ \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ +/* ymm2 = a [ C*1 | D*1 | A*3 | B*3 ] */ \ +/* ymm3 = a [ C*2 | D*2 | A*4 | B*4 ] */ \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ +/* ymm2 = a [ B*3 | A*3 | D*1 | C*1 ] */ \ +/* ymm3 = a [ B*4 | A*4 | D*2 | C*2 ] */ \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ +/* ymm2 = a [ A*3 | B*3 | C*1 | D*1 ] */ \ +/* ymm3 = a [ A*4 | B*4 | C*2 | D*2 ] */ \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ +/* ymm4 = a [ A*1 | B*1 | C*1 | D*1 ] */ \ +/* ymm5 = a [ A*2 | B*2 | C*2 | D*2 ] */ \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ +/* ymm6 = a [ A*3 | B*3 | C*3 | D*3 ] */ \ +/* ymm7 = a [ A*4 | B*4 | C*4 | D*4 ] */ \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + ymm9 = _mm256_permute4x64_pd(ymm9, 0xb1); \ + ymm11 = _mm256_permute4x64_pd(ymm11, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm8, ymm9, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm8, ymm9, 0x05); \ + ymm2 = _mm256_blend_pd(ymm10, ymm11, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm10, ymm11, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (4 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (5 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (6 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (7 * ldc)); \ + _mm256_storeu_pd(CO1 + (4 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (5 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (6 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (7 * ldc), ymm7); \ + \ + CO1 += 4; + +/******************************************************************************************/ + +#define INIT2x8() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + xmm8 = _mm_setzero_pd(); \ + xmm9 = _mm_setzero_pd(); \ + xmm10 = _mm_setzero_pd(); \ + xmm11 = _mm_setzero_pd(); \ + + +#define KERNEL2x8_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 =
_mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 8)); \ + xmm6 += xmm0 * xmm3; \ + xmm3 = _mm_set1_pd(*(BO - 7)); \ + xmm7 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 6)); \ + xmm8 += xmm0 * xmm2; \ + xmm2 = _mm_set1_pd(*(BO - 5)); \ + xmm9 += xmm0 * xmm3; \ + xmm10 += xmm0 * xmm1; \ + xmm11 += xmm0 * xmm2; \ + BO += 8; \ + AO += 2; + +#define SAVE2x8(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + xmm8 *= xmm0; \ + xmm9 *= xmm0; \ + xmm10 *= xmm0; \ + xmm11 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + xmm8 += _mm_loadu_pd(CO1 + (4 * ldc)); \ + xmm9 += _mm_loadu_pd(CO1 + (5 * ldc)); \ + xmm10+= _mm_loadu_pd(CO1 + (6 * ldc)); \ + xmm11+= _mm_loadu_pd(CO1 + (7 * ldc)); \ + _mm_storeu_pd(CO1 + (4 * ldc), xmm8); \ + _mm_storeu_pd(CO1 + (5 * ldc), xmm9); \ + _mm_storeu_pd(CO1 + (6 * ldc), xmm10); \ + _mm_storeu_pd(CO1 + (7 * ldc), xmm11); \ + CO1 += 2; + + + + +/******************************************************************************************/ + +#define INIT1x8() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + dbl8 = 0; \ + dbl9 = 0; \ + dbl10 = 0; \ + dbl11 = 0; + + +#define KERNEL1x8_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl4 += dbl0 * dbl1; \ + dbl1 = *(BO - 9); \ + dbl5 += dbl0 * dbl2; \ + dbl2 = *(BO - 8); \ + dbl6 += dbl0 * dbl3; \ + dbl3 = *(BO - 7); \ + dbl7 += dbl0 * dbl1; \ + dbl1 = *(BO - 6); \ + dbl8 += dbl0 * dbl2; \ + dbl2 = *(BO - 5); \ + dbl9 += dbl0 * dbl3; \ + dbl10 += dbl0 * dbl1; \ + dbl11 += dbl0 * dbl2; \ + BO += 8; \ + AO += 1; + + +#define SAVE1x8(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + dbl8 *= dbl0; \ + dbl9 *= dbl0; \ + dbl10 *= dbl0; \ + dbl11 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + dbl8 += *(CO1 + (4 * ldc)); \ + dbl9 += *(CO1 + (5 * ldc)); \ + dbl10 += *(CO1 + (6 * ldc)); \ + dbl11 += *(CO1 + (7 * ldc)); \ + *(CO1 + (4 * ldc)) = dbl8; \ + *(CO1 + (5 * ldc)) = dbl9; \ + *(CO1 + (6 * ldc)) = dbl10; \ + *(CO1 + (7 * ldc)) = dbl11; \ + \ + CO1 += 1; + + + + + + +/******************************************************************************************/ + +#define INIT4x4() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL4x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(BO - 12); \ + \ + ymm4 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm5 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \ + ymm6 += ymm0 * ymm1; \ + \ + ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \ + ymm7 += ymm0 * ymm1; \ + AO += 4; \ + BO += 4; + + +#define SAVE4x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \ + ymm7 = 
_mm256_permute4x64_pd(ymm7, 0xb1); \ + \ + ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \ + ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \ + ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \ + ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \ + \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \ + ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \ + ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \ + \ + ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \ + ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \ + ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \ + ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \ + \ + ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \ + ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \ + ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \ + _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \ + _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \ + _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \ + _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x4() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + + +#define KERNEL2x4_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_set1_pd(*(BO - 12)); \ + xmm2 = _mm_set1_pd(*(BO - 11)); \ + xmm3 = _mm_set1_pd(*(BO - 10)); \ + xmm4 += xmm0 * xmm1; \ + xmm1 = _mm_set1_pd(*(BO - 9)); \ + xmm5 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm0 * xmm1; \ + BO += 4; \ + AO += 2; + + + +#define SAVE2x4(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \ + xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \ + xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \ + \ + _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \ + _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \ + _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \ + _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \ + \ + CO1 += 2; + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x4() \ + dbl4 = 0; \ + dbl5 = 0; \ + dbl6 = 0; \ + dbl7 = 0; \ + +#define KERNEL1x4_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl3 = *(BO - 10); \ + dbl8 = *(BO - 9); \ + \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + dbl6 += dbl0 * dbl3; \ + dbl7 += dbl0 * dbl8; \ + BO += 4; \ + AO += 1; + + +#define SAVE1x4(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + dbl6 *= dbl0; \ + dbl7 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + dbl6 += *(CO1 + (2 * ldc)); \ + dbl7 += *(CO1 + (3 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + *(CO1 + (2 * ldc)) = dbl6; \ + *(CO1 + (3 * ldc)) = dbl7; \ + \ + \ + CO1 += 1; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT8x4() \ + ymm10 = _mm256_setzero_pd(); \ + ymm11 = _mm256_setzero_pd(); \ + ymm12 = _mm256_setzero_pd(); \ + ymm13 = _mm256_setzero_pd(); \ + ymm14 = _mm256_setzero_pd(); \ + ymm15 = _mm256_setzero_pd(); \ + ymm16 = _mm256_setzero_pd(); \ + ymm17 = 
_mm256_setzero_pd(); \ + + +#define KERNEL8x4_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 = _mm256_set1_pd(*(BO - 10)); \ + ymm5 = _mm256_set1_pd(*(BO - 9)); \ + ymm10 += ymm0 * ymm2; \ + ymm11 += ymm1 * ymm2; \ + ymm12 += ymm0 * ymm3; \ + ymm13 += ymm1 * ymm3; \ + ymm14 += ymm0 * ymm4; \ + ymm15 += ymm1 * ymm4; \ + ymm16 += ymm0 * ymm5; \ + ymm17 += ymm1 * ymm5; \ + BO += 4; \ + AO += 8; + + + +#define SAVE8x4(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm10 *= ymm0; \ + ymm11 *= ymm0; \ + ymm12 *= ymm0; \ + ymm13 *= ymm0; \ + ymm14 *= ymm0; \ + ymm15 *= ymm0; \ + ymm16 *= ymm0; \ + ymm17 *= ymm0; \ + \ + ymm10 += _mm256_loadu_pd(CO1); \ + ymm11 += _mm256_loadu_pd(CO1 + 4); \ + ymm12 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm13 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + ymm14 += _mm256_loadu_pd(CO1 + (ldc*2)); \ + ymm15 += _mm256_loadu_pd(CO1 + (ldc*2) + 4); \ + ymm16 += _mm256_loadu_pd(CO1 + (ldc*3)); \ + ymm17 += _mm256_loadu_pd(CO1 + (ldc*3) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm10); \ + _mm256_storeu_pd(CO1 + 4, ymm11); \ + _mm256_storeu_pd(CO1 + ldc, ymm12); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm13); \ + _mm256_storeu_pd(CO1 + ldc*2, ymm14); \ + _mm256_storeu_pd(CO1 + ldc*2 + 4, ymm15); \ + _mm256_storeu_pd(CO1 + ldc*3, ymm16); \ + _mm256_storeu_pd(CO1 + ldc*3 + 4, ymm17); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT8x2() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); \ + + +#define KERNEL8x2_SUB() \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm1 = _mm256_loadu_pd(AO - 12); \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm3 = _mm256_set1_pd(*(BO - 11)); \ + ymm4 += ymm0 * ymm2; \ + ymm5 += ymm1 * ymm2; \ + ymm6 += ymm0 * ymm3; \ + ymm7 += ymm1 * ymm3; \ + BO += 2; \ + AO += 8; + + + +#define SAVE8x2(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 *= ymm0; \ + ymm5 *= ymm0; \ + ymm6 *= ymm0; \ + ymm7 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + ymm5 += _mm256_loadu_pd(CO1 + 4); \ + ymm6 += _mm256_loadu_pd(CO1 + (ldc)); \ + ymm7 += _mm256_loadu_pd(CO1 + (ldc) + 4); \ + \ + _mm256_storeu_pd(CO1, ymm4); \ + _mm256_storeu_pd(CO1 + 4, ymm5); \ + _mm256_storeu_pd(CO1 + ldc, ymm6); \ + _mm256_storeu_pd(CO1 + ldc + 4, ymm7); \ + \ + CO1 += 8; + + +/******************************************************************************************/ +/******************************************************************************************/ +#define INIT4x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm5 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + xmm7 = _mm_setzero_pd(); \ + + +#define KERNEL4x2_SUB() \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm1 = _mm_loadu_pd(AO - 14); \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm5 += xmm1 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + xmm7 += xmm1 * xmm3; \ + BO += 2; \ + AO += 4; + + + +#define SAVE4x2(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm5 *= xmm0; \ + xmm6 *= xmm0; \ + xmm7 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm5 += _mm_loadu_pd(CO1 + 2); \ + xmm6 += _mm_loadu_pd(CO1 + (ldc)); \ + xmm7 += _mm_loadu_pd(CO1 + (ldc) + 2); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + 2, xmm5); \ + 
_mm_storeu_pd(CO1 + ldc, xmm6); \ + _mm_storeu_pd(CO1 + ldc + 2, xmm7); \ + \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x2() \ + xmm4 = _mm_setzero_pd(); \ + xmm6 = _mm_setzero_pd(); \ + + + +#define KERNEL2x2_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm3 = _mm_set1_pd(*(BO - 11)); \ + xmm4 += xmm0 * xmm2; \ + xmm6 += xmm0 * xmm3; \ + BO += 2; \ + AO += 2; + + +#define SAVE2x2(ALPHA) \ + if (ALPHA != 1.0) { \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + xmm6 *= xmm0; \ + } \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + xmm6 += _mm_loadu_pd(CO1 + ldc); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + _mm_storeu_pd(CO1 + ldc, xmm6); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x2() \ + dbl4 = 0; \ + dbl5 = 0; + + +#define KERNEL1x2_SUB() \ + dbl0 = *(AO - 16); \ + dbl1 = *(BO - 12); \ + dbl2 = *(BO - 11); \ + dbl4 += dbl0 * dbl1; \ + dbl5 += dbl0 * dbl2; \ + BO += 2; \ + AO += 1; + + +#define SAVE1x2(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl5 *= dbl0; \ + \ + dbl4 += *(CO1 + (0 * ldc)); \ + dbl5 += *(CO1 + (1 * ldc)); \ + *(CO1 + (0 * ldc)) = dbl4; \ + *(CO1 + (1 * ldc)) = dbl5; \ + \ + \ + CO1 += 1; + + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT4x1() \ + ymm4 = _mm256_setzero_pd(); \ + ymm5 = _mm256_setzero_pd(); \ + ymm6 = _mm256_setzero_pd(); \ + ymm7 = _mm256_setzero_pd(); + + +#define KERNEL4x1() \ + ymm0 = _mm256_set1_pd(*(BO - 12)); \ + ymm1 = _mm256_set1_pd(*(BO - 11)); \ + ymm2 = _mm256_set1_pd(*(BO - 10)); \ + ymm3 = _mm256_set1_pd(*(BO - 9)); \ + \ + ymm4 += _mm256_loadu_pd(AO - 16) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO - 12) * ymm1; \ + \ + ymm0 = _mm256_set1_pd(*(BO - 8)); \ + ymm1 = _mm256_set1_pd(*(BO - 7)); \ + \ + ymm6 += _mm256_loadu_pd(AO - 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO - 4) * ymm3; \ + \ + ymm2 = _mm256_set1_pd(*(BO - 6)); \ + ymm3 = _mm256_set1_pd(*(BO - 5)); \ + \ + ymm4 += _mm256_loadu_pd(AO + 0) * ymm0; \ + ymm5 += _mm256_loadu_pd(AO + 4) * ymm1; \ + ymm6 += _mm256_loadu_pd(AO + 8) * ymm2; \ + ymm7 += _mm256_loadu_pd(AO + 12) * ymm3; \ + \ + BO += 8; \ + AO += 32; + + +#define INIT8x1() \ + zmm4 = _mm512_setzero_pd(); \ + + +#define KERNEL8x1_SUB() \ + zmm2 = _mm512_set1_pd(*(BO - 12)); \ + zmm0 = _mm512_loadu_pd(AO - 16); \ + zmm4 += zmm0 * zmm2; \ + BO += 1; \ + AO += 8; + + +#define SAVE8x1(ALPHA) \ + zmm0 = _mm512_set1_pd(ALPHA); \ + zmm4 *= zmm0; \ + \ + zmm4 += _mm512_loadu_pd(CO1); \ + _mm512_storeu_pd(CO1, zmm4); \ + CO1 += 8; + +#define KERNEL4x1_SUB() \ + ymm2 = _mm256_set1_pd(*(BO - 12)); \ + ymm0 = _mm256_loadu_pd(AO - 16); \ + ymm4 += ymm0 * ymm2; \ + BO += 1; \ + AO += 4; + + +#define SAVE4x1(ALPHA) \ + ymm0 = _mm256_set1_pd(ALPHA); \ + ymm4 += ymm5; \ + ymm6 += ymm7; \ + ymm4 += ymm6; \ + ymm4 *= ymm0; \ + \ + ymm4 += _mm256_loadu_pd(CO1); \ + _mm256_storeu_pd(CO1, ymm4); \ + CO1 += 4; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT2x1() 
\ + xmm4 = _mm_setzero_pd(); + + +#define KERNEL2x1_SUB() \ + xmm2 = _mm_set1_pd(*(BO - 12)); \ + xmm0 = _mm_loadu_pd(AO - 16); \ + xmm4 += xmm0 * xmm2; \ + BO += 1; \ + AO += 2; + + +#define SAVE2x1(ALPHA) \ + xmm0 = _mm_set1_pd(ALPHA); \ + xmm4 *= xmm0; \ + \ + xmm4 += _mm_loadu_pd(CO1); \ + \ + _mm_storeu_pd(CO1, xmm4); \ + \ + CO1 += 2; + + +/******************************************************************************************/ +/******************************************************************************************/ + +#define INIT1x1() \ + dbl4 = 0; + +#define KERNEL1x1_SUB() \ + dbl1 = *(BO - 12); \ + dbl0 = *(AO - 16); \ + dbl4 += dbl0 * dbl1; \ + BO += 1; \ + AO += 1; + +#define SAVE1x1(ALPHA) \ + dbl0 = ALPHA; \ + dbl4 *= dbl0; \ + dbl4 += *CO1; \ + *CO1 = dbl4; \ + CO1 += 1; + + +/*******************************************************************************************/ + +/* START */ + + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc) +{ + unsigned long M=m, N=n, K=k; + + + if (M == 0) + return 0; + if (N == 0) + return 0; + if (K == 0) + return 0; + + while (N >= 8) { + double *CO1; + double *AO; + int i; + + CO1 = C; + C += 8 * ldc; + + AO = A + 16; + + i = m; + + while (i >= 24) { + double *BO; + double *A1, *A2; + int kloop = K; + + BO = B + 12; + A1 = AO + 8 * K; + A2 = AO + 16 * K; + /* + * This is the inner loop for the hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built-in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vmovapd %%zmm1, %%zmm11\n" + "vmovapd %%zmm1, %%zmm12\n" + "vmovapd %%zmm1, %%zmm13\n" + "vmovapd %%zmm1, %%zmm14\n" + "vmovapd %%zmm1, %%zmm15\n" + "vmovapd %%zmm1, %%zmm16\n" + "vmovapd %%zmm1, %%zmm17\n" + "vmovapd %%zmm1, %%zmm18\n" + "vmovapd %%zmm1, %%zmm21\n" + "vmovapd %%zmm1, %%zmm22\n" + "vmovapd %%zmm1, %%zmm23\n" + "vmovapd %%zmm1, %%zmm24\n" + "vmovapd %%zmm1, %%zmm25\n" + "vmovapd %%zmm1, %%zmm26\n" + "vmovapd %%zmm1, %%zmm27\n" + "vmovapd %%zmm1, %%zmm28\n" + "jmp .label24\n" + ".align 32\n" + /* Inner math loop */ + ".label24:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vmovupd -128(%[A1]),%%zmm10\n" + "vmovupd -128(%[A2]),%%zmm20\n" + + "vbroadcastsd -96(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm21\n" + + "vbroadcastsd -88(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm22\n" + + "vbroadcastsd -80(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm23\n" + + "vbroadcastsd -72(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm24\n" + + "vbroadcastsd -64(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm25\n" + + "vbroadcastsd -56(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n" + "vfmadd231pd %%zmm9, %%zmm10, 
%%zmm16\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm26\n" + + "vbroadcastsd -48(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm27\n" + + "vbroadcastsd -40(%[BO]), %%zmm9\n" + "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n" + "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n" + "vfmadd231pd %%zmm9, %%zmm20, %%zmm28\n" + "add $64, %[AO]\n" + "add $64, %[A1]\n" + "add $64, %[A2]\n" + "add $64, %[BO]\n" + "prefetch 512(%[AO])\n" + "prefetch 512(%[A1])\n" + "prefetch 512(%[A2])\n" + "prefetch 512(%[BO])\n" + "subl $1, %[kloop]\n" + "jg .label24\n" + /* multiply the result by alpha */ + "vbroadcastsd (%[alpha]), %%zmm9\n" + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + "vmulpd %%zmm9, %%zmm11, %%zmm11\n" + "vmulpd %%zmm9, %%zmm12, %%zmm12\n" + "vmulpd %%zmm9, %%zmm13, %%zmm13\n" + "vmulpd %%zmm9, %%zmm14, %%zmm14\n" + "vmulpd %%zmm9, %%zmm15, %%zmm15\n" + "vmulpd %%zmm9, %%zmm16, %%zmm16\n" + "vmulpd %%zmm9, %%zmm17, %%zmm17\n" + "vmulpd %%zmm9, %%zmm18, %%zmm18\n" + "vmulpd %%zmm9, %%zmm21, %%zmm21\n" + "vmulpd %%zmm9, %%zmm22, %%zmm22\n" + "vmulpd %%zmm9, %%zmm23, %%zmm23\n" + "vmulpd %%zmm9, %%zmm24, %%zmm24\n" + "vmulpd %%zmm9, %%zmm25, %%zmm25\n" + "vmulpd %%zmm9, %%zmm26, %%zmm26\n" + "vmulpd %%zmm9, %%zmm27, %%zmm27\n" + "vmulpd %%zmm9, %%zmm28, %%zmm28\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + "vaddpd 128(%[C0]), %%zmm21, %%zmm21\n" + "vaddpd 128(%[C1]), %%zmm22, %%zmm22\n" + "vaddpd 128(%[C2]), %%zmm23, %%zmm23\n" + "vaddpd 128(%[C3]), %%zmm24, %%zmm24\n" + "vaddpd 128(%[C4]), %%zmm25, %%zmm25\n" + "vaddpd 128(%[C5]), %%zmm26, %%zmm26\n" + "vaddpd 128(%[C6]), %%zmm27, %%zmm27\n" + "vaddpd 128(%[C7]), %%zmm28, %%zmm28\n" + "vmovupd %%zmm21, 128(%[C0])\n" + "vmovupd %%zmm22, 128(%[C1])\n" + "vmovupd %%zmm23, 128(%[C2])\n" + "vmovupd %%zmm24, 128(%[C3])\n" + "vmovupd %%zmm25, 128(%[C4])\n" + "vmovupd %%zmm26, 128(%[C5])\n" + "vmovupd %%zmm27, 128(%[C6])\n" + "vmovupd %%zmm28, 128(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [A2] "+r" (A2), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 
+        while (i >= 16) {
+            double *BO;
+            double *A1;
+            int kloop = K;
+
+            BO = B + 12;
+            A1 = AO + 8 * K;
+            /*
+             * This is the inner loop for the hot path.
+             * Written in inline asm because compilers like GCC 8 and earlier
+             * struggle with register allocation and are not good at using
+             * the AVX512 built-in broadcast ability (1to8).
+             */
+            asm(
+                "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+                "vmovapd %%zmm1, %%zmm2\n"
+                "vmovapd %%zmm1, %%zmm3\n"
+                "vmovapd %%zmm1, %%zmm4\n"
+                "vmovapd %%zmm1, %%zmm5\n"
+                "vmovapd %%zmm1, %%zmm6\n"
+                "vmovapd %%zmm1, %%zmm7\n"
+                "vmovapd %%zmm1, %%zmm8\n"
+                "vmovapd %%zmm1, %%zmm11\n"
+                "vmovapd %%zmm1, %%zmm12\n"
+                "vmovapd %%zmm1, %%zmm13\n"
+                "vmovapd %%zmm1, %%zmm14\n"
+                "vmovapd %%zmm1, %%zmm15\n"
+                "vmovapd %%zmm1, %%zmm16\n"
+                "vmovapd %%zmm1, %%zmm17\n"
+                "vmovapd %%zmm1, %%zmm18\n"
+                "jmp .label16\n"
+                ".align 32\n"
+                /* Inner math loop */
+                ".label16:\n"
+                "vmovupd -128(%[AO]),%%zmm0\n"
+                "vmovupd -128(%[A1]),%%zmm10\n"
+
+                "vbroadcastsd -96(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm1\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm11\n"
+
+                "vbroadcastsd -88(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm2\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm12\n"
+
+                "vbroadcastsd -80(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm3\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm13\n"
+
+                "vbroadcastsd -72(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm4\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm14\n"
+
+                "vbroadcastsd -64(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm5\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm15\n"
+
+                "vbroadcastsd -56(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm6\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm16\n"
+
+                "vbroadcastsd -48(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm7\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm17\n"
+
+                "vbroadcastsd -40(%[BO]), %%zmm9\n"
+                "vfmadd231pd %%zmm9, %%zmm0, %%zmm8\n"
+                "vfmadd231pd %%zmm9, %%zmm10, %%zmm18\n"
+                "add $64, %[AO]\n"
+                "add $64, %[A1]\n"
+                "add $64, %[BO]\n"
+                "prefetch 512(%[AO])\n"
+                "prefetch 512(%[A1])\n"
+                "prefetch 512(%[BO])\n"
+                "subl $1, %[kloop]\n"
+                "jg .label16\n"
+                /* multiply the result by alpha */
+                "vbroadcastsd (%[alpha]), %%zmm9\n"
+                "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+                "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+                "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+                "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+                "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+                "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+                "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+                "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+                "vmulpd %%zmm9, %%zmm11, %%zmm11\n"
+                "vmulpd %%zmm9, %%zmm12, %%zmm12\n"
+                "vmulpd %%zmm9, %%zmm13, %%zmm13\n"
+                "vmulpd %%zmm9, %%zmm14, %%zmm14\n"
+                "vmulpd %%zmm9, %%zmm15, %%zmm15\n"
+                "vmulpd %%zmm9, %%zmm16, %%zmm16\n"
+                "vmulpd %%zmm9, %%zmm17, %%zmm17\n"
+                "vmulpd %%zmm9, %%zmm18, %%zmm18\n"
+                /* And store additively in C */
+                "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+                "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+                "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+                "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+                "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+                "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+                "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+                "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+                "vmovupd %%zmm1, (%[C0])\n"
+                "vmovupd %%zmm2, (%[C1])\n"
+                "vmovupd %%zmm3, (%[C2])\n"
+                "vmovupd %%zmm4, (%[C3])\n"
+                "vmovupd %%zmm5, (%[C4])\n"
+                "vmovupd %%zmm6, (%[C5])\n"
+                "vmovupd %%zmm7, (%[C6])\n"
+                "vmovupd %%zmm8, (%[C7])\n"
+
+                "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n"
+                "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n"
+                "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n"
+                "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n"
+                "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n"
+                "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n"
+                "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n"
+                "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n"
+                "vmovupd %%zmm11, 64(%[C0])\n"
+                "vmovupd %%zmm12, 64(%[C1])\n"
+                "vmovupd %%zmm13, 64(%[C2])\n"
+                "vmovupd %%zmm14, 64(%[C3])\n"
+                "vmovupd %%zmm15, 64(%[C4])\n"
+                "vmovupd %%zmm16, 64(%[C5])\n"
+                "vmovupd %%zmm17, 64(%[C6])\n"
+                "vmovupd %%zmm18, 64(%[C7])\n"
+
+                :
+                [AO] "+r" (AO),
+                [A1] "+r" (A1),
+                [BO] "+r" (BO),
+                [C0] "+r" (CO1),
+                [kloop] "+r" (kloop)
+                :
+                [alpha] "r" (&alpha),
+                [C1] "r" (CO1 + 1 * ldc),
+                [C2] "r" (CO1 + 2 * ldc),
+                [C3] "r" (CO1 + 3 * ldc),
+                [C4] "r" (CO1 + 4 * ldc),
+                [C5] "r" (CO1 + 5 * ldc),
+                [C6] "r" (CO1 + 6 * ldc),
+                [C7] "r" (CO1 + 7 * ldc)
+
+                : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
+                  "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18"
+            );
+            CO1 += 16;
+            AO += 8 * K;
+            i -= 16;
+        }
+
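+        /*
+         * For the last group of 8 or more rows a single zmm panel suffices.
+         * Here the B values feed the FMAs directly as {1to8}
+         * embedded-broadcast memory operands instead of going through
+         * separate vbroadcastsd instructions.
+         */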
%%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + + "vaddpd 64(%[C0]), %%zmm11, %%zmm11\n" + "vaddpd 64(%[C1]), %%zmm12, %%zmm12\n" + "vaddpd 64(%[C2]), %%zmm13, %%zmm13\n" + "vaddpd 64(%[C3]), %%zmm14, %%zmm14\n" + "vaddpd 64(%[C4]), %%zmm15, %%zmm15\n" + "vaddpd 64(%[C5]), %%zmm16, %%zmm16\n" + "vaddpd 64(%[C6]), %%zmm17, %%zmm17\n" + "vaddpd 64(%[C7]), %%zmm18, %%zmm18\n" + "vmovupd %%zmm11, 64(%[C0])\n" + "vmovupd %%zmm12, 64(%[C1])\n" + "vmovupd %%zmm13, 64(%[C2])\n" + "vmovupd %%zmm14, 64(%[C3])\n" + "vmovupd %%zmm15, 64(%[C4])\n" + "vmovupd %%zmm16, 64(%[C5])\n" + "vmovupd %%zmm17, 64(%[C6])\n" + "vmovupd %%zmm18, 64(%[C7])\n" + + : + [AO] "+r" (AO), + [A1] "+r" (A1), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", + "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18" + ); + CO1 += 16; + AO += 8 * K; + i-= 16; + } + + while (i >= 8) { + double *BO; + int kloop = K; + + BO = B + 12; + /* + * This is the inner loop for the hot hot path + * Written in inline asm because compilers like GCC 8 and earlier + * struggle with register allocation and are not good at using + * the AVX512 built in broadcast ability (1to8) + */ + asm( + "vxorpd %%zmm1, %%zmm1, %%zmm1\n" + "vmovapd %%zmm1, %%zmm2\n" + "vmovapd %%zmm1, %%zmm3\n" + "vmovapd %%zmm1, %%zmm4\n" + "vmovapd %%zmm1, %%zmm5\n" + "vmovapd %%zmm1, %%zmm6\n" + "vmovapd %%zmm1, %%zmm7\n" + "vmovapd %%zmm1, %%zmm8\n" + "vbroadcastsd (%[alpha]), %%zmm9\n" + "jmp .label1\n" + ".align 32\n" + /* Inner math loop */ + ".label1:\n" + "vmovupd -128(%[AO]),%%zmm0\n" + "vfmadd231pd -96(%[BO])%{1to8%}, %%zmm0, %%zmm1\n" + "vfmadd231pd -88(%[BO])%{1to8%}, %%zmm0, %%zmm2\n" + "vfmadd231pd -80(%[BO])%{1to8%}, %%zmm0, %%zmm3\n" + "vfmadd231pd -72(%[BO])%{1to8%}, %%zmm0, %%zmm4\n" + "vfmadd231pd -64(%[BO])%{1to8%}, %%zmm0, %%zmm5\n" + "vfmadd231pd -56(%[BO])%{1to8%}, %%zmm0, %%zmm6\n" + "vfmadd231pd -48(%[BO])%{1to8%}, %%zmm0, %%zmm7\n" + "vfmadd231pd -40(%[BO])%{1to8%}, %%zmm0, %%zmm8\n" + "add $64, %[AO]\n" + "add $64, %[BO]\n" + "subl $1, %[kloop]\n" + "jg .label1\n" + /* multiply the result by alpha */ + "vmulpd %%zmm9, %%zmm1, %%zmm1\n" + "vmulpd %%zmm9, %%zmm2, %%zmm2\n" + "vmulpd %%zmm9, %%zmm3, %%zmm3\n" + "vmulpd %%zmm9, %%zmm4, %%zmm4\n" + "vmulpd %%zmm9, %%zmm5, %%zmm5\n" + "vmulpd %%zmm9, %%zmm6, %%zmm6\n" + "vmulpd %%zmm9, %%zmm7, %%zmm7\n" + "vmulpd %%zmm9, %%zmm8, %%zmm8\n" + /* And store additively in C */ + "vaddpd (%[C0]), %%zmm1, %%zmm1\n" + "vaddpd (%[C1]), %%zmm2, %%zmm2\n" + "vaddpd (%[C2]), %%zmm3, %%zmm3\n" + "vaddpd (%[C3]), %%zmm4, %%zmm4\n" + "vaddpd (%[C4]), %%zmm5, %%zmm5\n" + "vaddpd (%[C5]), %%zmm6, %%zmm6\n" + "vaddpd (%[C6]), %%zmm7, %%zmm7\n" + "vaddpd (%[C7]), %%zmm8, %%zmm8\n" + "vmovupd %%zmm1, (%[C0])\n" + "vmovupd %%zmm2, (%[C1])\n" + "vmovupd %%zmm3, (%[C2])\n" + "vmovupd %%zmm4, (%[C3])\n" + "vmovupd %%zmm5, (%[C4])\n" + "vmovupd %%zmm6, (%[C5])\n" + "vmovupd %%zmm7, (%[C6])\n" + "vmovupd %%zmm8, (%[C7])\n" + "prefetchw 
64(%[C0])\n" + "prefetchw 64(%[C1])\n" + "prefetchw 64(%[C2])\n" + "prefetchw 64(%[C3])\n" + "prefetchw 64(%[C4])\n" + "prefetchw 64(%[C5])\n" + "prefetchw 64(%[C6])\n" + "prefetchw 64(%[C7])\n" + : + [AO] "+r" (AO), + [BO] "+r" (BO), + [C0] "+r" (CO1), + [kloop] "+r" (kloop) + : + [alpha] "r" (&alpha), + [C1] "r" (CO1 + 1 * ldc), + [C2] "r" (CO1 + 2 * ldc), + [C3] "r" (CO1 + 3 * ldc), + [C4] "r" (CO1 + 4 * ldc), + [C5] "r" (CO1 + 5 * ldc), + [C6] "r" (CO1 + 6 * ldc), + [C7] "r" (CO1 + 7 * ldc) + + : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9" + ); + CO1 += 8; + i-= 8; + } + + + + while (i >= 4) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11; + int kloop = K; + + BO = B + 12; + INIT4x8() + + while (kloop > 0) { + KERNEL4x8_SUB() + kloop--; + } + SAVE4x8(alpha) + i-= 4; + } + + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + int kloop = K; + + BO = B + 12; + INIT2x8() + + while (kloop > 0) { + KERNEL2x8_SUB() + kloop--; + } + SAVE2x8(alpha) + i -= 2; + } + + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8, dbl9, dbl10, dbl11; + int kloop = K; + + BO = B + 12; + INIT1x8() + + while (kloop > 0) { + KERNEL1x8_SUB() + kloop--; + } + SAVE1x8(alpha) + i -= 1; + } + B += K * 8; + N -= 8; + } + + if (N == 0) + return 0; + + + + // L8_0 + while (N >= 4) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 4 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + // L8_11 + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm10, ymm11,ymm12,ymm13,ymm14,ymm15,ymm16,ymm17; + BO = B + 12; + int kloop = K; + + INIT8x4() + + while (kloop > 0) { + // L12_17 + KERNEL8x4_SUB() + kloop--; + } + // L8_19 + SAVE8x4(alpha) + + i -= 8; + } + while (i >= 4) { + // L8_11 + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + BO = B + 12; + int kloop = K; + + INIT4x4() + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x4_SUB() + kloop--; + } + // L8_19 + SAVE4x4(alpha) + + i -= 4; + } + +/************************************************************************** +* Rest of M +***************************************************************************/ + + while (i >= 2) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + BO = B; + BO += 12; + + INIT2x4() + int kloop = K; + + while (kloop > 0) { + KERNEL2x4_SUB() + kloop--; + } + SAVE2x4(alpha) + i -= 2; + } + // L13_40 + while (i >= 1) { + double *BO; + double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8; + int kloop = K; + BO = B + 12; + INIT1x4() + + while (kloop > 0) { + KERNEL1x4_SUB() + kloop--; + } + SAVE1x4(alpha) + i -= 1; + } + + B += K * 4; + N -= 4; + } + +/**************************************************************************************************/ + + // L8_0 + while (N >= 2) { + double *CO1; + double *AO; + int i; + // L8_10 + CO1 = C; + C += 2 * ldc; + + AO = A + 16; + + i = m; + while (i >= 8) { + double *BO; + __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT8x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL8x2_SUB() + kloop--; + } + // L8_19 + SAVE8x2(alpha) + + i-=8; + } + + while (i >= 4) { + double *BO; + __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + // L8_11 + BO = B + 12; + int kloop = K; + + INIT4x2() + + // L8_16 + while (kloop > 0) { + // L12_17 + KERNEL4x2_SUB() + 
+    // L8_0
+    while (N >= 1) {
+        // L8_10
+        double *CO1;
+        double *AO;
+        int i;
+
+        CO1 = C;
+        C += ldc;
+
+        AO = A + 16;
+
+        i = m;
+        while (i >= 8) {
+            double *BO;
+            __m512d zmm0, zmm2, zmm4;
+            // L8_11
+            BO = B + 12;
+            int kloop = K;
+
+            INIT8x1()
+            // L8_16
+            while (kloop > 0) {
+                // L12_17
+                KERNEL8x1_SUB()
+                kloop--;
+            }
+            // L8_19
+            SAVE8x1(alpha)
+
+            i -= 8;
+        }
+        while (i >= 4) {
+            double *BO;
+            __m256d ymm0, ymm2, ymm4, ymm5, ymm6, ymm7;
+            // L8_11
+            BO = B + 12;
+            int kloop = K;
+
+            INIT4x1()
+            // L8_16
+            while (kloop > 0) {
+                // L12_17
+                KERNEL4x1_SUB()
+                kloop--;
+            }
+            // L8_19
+            SAVE4x1(alpha)
+
+            i -= 4;
+        }
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+        while (i >= 2) {
+            double *BO;
+            __m128d xmm0, xmm2, xmm4;
+            int kloop = K;
+            BO = B;
+            BO += 12;
+
+            INIT2x1()
+
+            while (kloop > 0) {
+                KERNEL2x1_SUB()
+                kloop--;
+            }
+            SAVE2x1(alpha)
+            i -= 2;
+        }
+        // L13_40
+        while (i >= 1) {
+            double *BO;
+            double dbl0, dbl1, dbl4;
+            int kloop = K;
+
+            BO = B;
+            BO += 12;
+            INIT1x1()
+
+            while (kloop > 0) {
+                KERNEL1x1_SUB()
+                kloop--;
+            }
+            SAVE1x1(alpha)
+            i -= 1;
+        }
+
+        B += K * 1;
+        N -= 1;
+    }
+
+
+    return 0;
+}
diff --git a/utest/test_fork.c b/utest/test_fork.c
index 9fc51287c..0b90407b1 100644
--- a/utest/test_fork.c
+++ b/utest/test_fork.c
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
-#include "openblas_utest.h"
 #include 
 #include 
 #include 
+#include "openblas_utest.h"
 
 void* xmalloc(size_t n)
 {