diff --git a/CMakeLists.txt b/CMakeLists.txt index c20a57eac..e6ae891b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h ) - ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) + + ADD_CUSTOM_TARGET(genconfig + ALL + DEPENDS openblas_config.h + ) add_dependencies(genconfig ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(genf77blas + ALL COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h ) @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(gencblas + ALL COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h ) - add_dependencies(gencblas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/Makefile.arm b/Makefile.arm index c189b0c47..eedd39b73 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,5 +1,4 @@ -#ifeq logical or -ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon 
-march=armv7-a @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif -ifeq ($(CORE), ARMV7) -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a -else -CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -endif -else -CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -endif -endif - ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif - ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -march=armv5 -FCOMMON_OPT += -marm -march=armv5 +CCOMMON_OPT += -march=armv5 +FCOMMON_OPT += -march=armv5 endif diff --git a/Makefile.arm64 b/Makefile.arm64 index 7e9df2f4b..d19e796a5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx endif ifeq ($(CORE), THUNDERX2T99) -CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan -FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan +CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 +FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 endif diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake index 115bdaf4e..56ae612ea 100644 --- a/cmake/c_check.cmake +++ b/cmake/c_check.cmake @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") +if (${HOST_OS} STREQUAL "WINDOWSSTORE") + file(APPEND ${TARGET_CONF} + "#define OS_WINNT\t1\n") +endif () + diff --git a/cmake/os.cmake b/cmake/os.cmake index f5a75027c..e9df68d7f 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -77,7 +77,7 @@ if (CYGWIN) set(NO_EXPRECISION 1) endif () -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") if (SMP) set(EXTRALIB 
"${EXTRALIB} -lpthread") endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6a21c0bcc..a7f98bfb8 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -72,20 +72,26 @@ if (MSVC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) endif() +if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + # disable WindowsStore strict CRT checks + set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) +endif () + set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) -try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH_RESULT ${GETARCH_DIR} + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} + ) -if (NOT ${GETARCH_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + if (NOT ${GETARCH_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") + endif () endif () - message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) -try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GETARCH2_RESULT 
${GETARCH2_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GETARCH2_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} + ) -if (NOT ${GETARCH2_RESULT}) - MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + if (NOT ${GETARCH2_RESULT}) + MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") + endif () endif () # use the cmake binary w/ the -E param to run a shell command in a cross-platform way @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) -try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE GEN_CONFIG_H_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} -) +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") + try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} + SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GEN_CONFIG_H_LOG + COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} + ) -if (NOT ${GEN_CONFIG_H_RESULT}) - MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") -endif () + if (NOT ${GEN_CONFIG_H_RESULT}) + MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") + endif () +endif () \ No newline at end of file diff --git a/common.h b/common.h index c9cc2f0f2..4463141c8 100644 --- a/common.h +++ b/common.h @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef ASSEMBLER +#ifdef OS_WINDOWSSTORE +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) 0 +#else #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, 
n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif +#endif #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ *address = 0; } - +#ifdef OS_WINDOWSSTORE +static __inline int readenv_atoi(char *env) { + return 0; +} +#else #ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { return(0); } #endif - +#endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) diff --git a/common_arm.h b/common_arm.h index a17acb448..27fa76b76 100644 --- a/common_arm.h +++ b/common_arm.h @@ -111,11 +111,6 @@ REALNAME: #define PROFCODE -#ifdef __ARM_PCS -//-mfloat-abi=softfp -#define SOFT_FLOAT_ABI -#endif - #endif diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index ef9d58d76..e86b565f8 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; - BLASLONG range_m[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index a0377d638..5718c0ec9 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x #endif blas_arg_t args; - blas_queue_t queue[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index f8ae3cdcd..035300841 100644 --- a/driver/level2/spmv_thread.c +++ 
b/driver/level2/spmv_thread.c @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index bbb1c50eb..226a922e9 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index 47dc1daf9..c91b52775 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 42edb83cb..0a155366c 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; - BLASLONG range_n[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 94274be72..5f40853dc 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG 
*range_n, int ( if (nthreads - num_cpu > 1) { di = (double)i; - width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; + width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; @@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (nthreads - num_cpu > 1) { di = (double)(arg -> n - i); - width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; + width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 489d40c76..8e0be1e0e 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -12,6 +12,8 @@ if (SMP) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) + elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") + set(BLAS_SERVER blas_server_win32.c) endif () if (NOT DEFINED BLAS_SERVER) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 081bdd7d4..cde8ca793 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], 5); //INFINITE); - TerminateThread(blas_threads[i],0); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); +#ifndef OS_WINDOWSSTORE +// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + TerminateThread(blas_threads[i],0); +#endif } blas_server_avail = 0; diff --git a/driver/others/init.c b/driver/others/init.c index 9be6f52b0..3e6176967 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -354,6 +354,24 @@ static int numa_check(void) { return common -> num_nodes; } +#if 
defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 6) +int sched_getcpu(void) +{ +int cpu; +FILE *fp = NULL; +if ( (fp = fopen("/proc/self/stat", "r")) == NULL) + return -1; +if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) { + fclose (fp); + return -1; + } + fclose (fp); + return(cpu); +} +#endif +#endif + static void numa_mapping(void) { int node, cpu, core; @@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { - #ifdef DEBUG fprintf(stderr, "Shared Memory Initialization.\n"); #endif @@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); common -> final_num_procs = 0; - for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. 
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 16bde105b..960dae67b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,7 +1,5 @@ +include $(KERNELDIR)/KERNEL.ARMV5 - - -############################################################################### SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S @@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S @@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S @@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S - SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o +endif SGEMMONCOPY = sgemm_ncopy_2_vfp.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S 
DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o +endif DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o @@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d5cd94fbd..5e0b4cfb8 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,91 +1,12 @@ - -################################################################################# -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S - -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S - -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S - -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S - -ISAMAXKERNEL = iamax_vfp.S -IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S - -ISAMINKERNEL = iamax_vfp.S -IDAMINKERNEL = iamax_vfp.S 
-ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S - -ISMAXKERNEL = iamax_vfp.S -IDMAXKERNEL = iamax_vfp.S - -ISMINKERNEL = iamax_vfp.S -IDMINKERNEL = iamax_vfp.S - -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S +include $(KERNELDIR)/KERNEL.ARMV6 SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S - -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index fe6242a5b..5b08e5028 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -475,6 +475,14 @@ asum_kernel_L999: vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + bx lr EPILOGUE diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62..37515f399 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_INC_X [fp, #0 ] -#define OLD_Y [fp, #4 ] -#define OLD_INC_Y [fp, #8 ] -#else +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] +#else +#define OLD_ALPHA [fp, #0] +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define OLD_Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] #endif - + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_ALPHAR [fp, #0] +#define OLD_ALPHAI [fp, #8] +#define OLD_X [fp, #16 ] +#define OLD_INC_X [fp, #20 ] +#define OLD_Y [fp, #24 ] +#define OLD_INC_Y [fp, #28 ] +#endif + +#endif //!defined(COMPLEX) + +#else //__ARM_PCS_VFP + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + +#endif //!defined(__ARM_PCS_VFP) + #define N r0 #define Y r1 #define INC_X r2 @@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define FMAC_R1 fmacd -#define FMAC_R2 fnmacd +#define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs -#define FMAC_R2 fnmacs +#define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd -#define FMAC_I1 fnmacd +#define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 fmacs -#define FMAC_I1 fnmacs +#define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack -#ifdef ARM_SOFTFP_ABI -#ifndef DOUBLE - vmov s0, r3 //move alpha to s0 +#if !defined(__ARM_PCS_VFP) +#if !defined(COMPLEX) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA + ldr X, OLD_X +#else + vldr d0, OLD_ALPHA ldr X, OLD_X #endif +#else //COMPLEX +#if !defined(DOUBLE) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr X, OLD_X +#else + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr X, OLD_X #endif - +#endif +#endif + ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 0497b6d83..e5a6e4d35 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmov s2, s0 vmov s3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y +#endif cmp N, #0 ble cdot_kernel_L999 @@ -265,7 +283,6 @@ cdot_kernel_S10: cdot_kernel_L999: - sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers @@ -276,8 +293,11 @@ cdot_kernel_L999: vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {s0 - s1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47..71bc50efd 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a184..9d473ad78 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs - #define FMAC_R2 fnmacs + #define FMAC_R1 vmls.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmacs + #define FMAC_R1 vmls.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs - #define FMAC_I2 fnmacs + #define FMAC_I1 vmls.f32 + #define FMAC_I2 vmls.f32 #endif @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d2748644..62ee33bb9 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble cgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f18..c07b6d6f8 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble cgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9d..aae890ea9 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs - #define KMAC_I fnmacs + #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else - #define KMAC_R fnmacs + #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs - #define FMAC_I1 fnmacs + #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69..79e7ed07f 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls - #define FMAC_R2 fnmacs + #define FMAC_R1 vnmul.f32 + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls - #define FMAC_I2 fnmacs + #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuls #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls + #define FMAC_I1 vnmul.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubs #define FMAC_R1 fmuls - #define FMAC_R2 fnmacs + #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 fmacs @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubs #define FADD_I fadds - #define FMAC_R1 fnmuls + #define FMAC_R1 vnmul.f32 #define FMAC_R2 fmacs - #define FMAC_I1 fnmuls - #define FMAC_I2 fnmacs + #define FMAC_I1 vnmul.f32 + #define FMAC_I2 vmls.f32 #endif @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index f28acbae3..fb294d8b4 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -246,6 +246,9 @@ ddot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value +#if !defined(__ARM_PCS_VFP) + vmov r0, r1, d0 +#endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 183269d1b..001a6050c 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] - +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index b14052e06..1744b54d8 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index c578d2b1e..3d6fbf8e9 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c7e455f16..c0c6a1677 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 385370b7f..7c154d741 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_M r0 @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble gemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 93bf23e49..54f958b7b 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_M r0 #define AO1 r0 @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble gemvn_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M - - + ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 816be54ff..9559d1829 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard abi -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_N r1 #define M r0 @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0 ble gemvt_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index 7ae5799bc..b1d3dadf1 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_N r1 @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp OLD_N, #0 ble gemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index b3bd28152..16ac5a632 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -573,6 +573,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 +#endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif #endif bx lr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7af966895..84977901d 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -503,8 +503,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 -#ifdef ARM_SOFTFP_ABI - vmov r0, s0 +#endif + +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 #endif #endif diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b6..25f563690 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_INC_Y [fp, #0 ] +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) +#define OLD_C [fp, #4] +#define OLD_S [fp, #8] +#else +#define OLD_C [fp, #8] +#define OLD_S [fp, #16] +#endif +#endif #define N r0 #define X r1 @@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d5 vmul.f64 d3 , d0, d5 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X, { d2 } fstmiad Y, { d3 } @@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s5 vmul.f32 s3 , s0, s5 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X, { s2 } fstmias Y, { s3 } @@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 fstmiad X!, { d2 } fstmiad Y!, { d3 } vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 fstmiad X!, { d2 } fstmiad Y!, { d3 } @@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 vstr d2 , [ X, #0 ] vstr d3 , [ Y, #0 ] vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 vstr d2 , [ X, #8 ] vstr d3 , [ Y, #8 ] @@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 vstr s2 , [ X, #0 ] vstr s3 , [ Y, #0 ] vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 vstr s2 , [ X, #4 ] vstr s3 , [ Y, #4 ] @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index a8939c3a2..cc3e3b98d 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X, { d2 - d3 } @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X, { s2 - s3 } diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197..5f4f424bf 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742..1f21e5a1f 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac90..6491d3571 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644ec..635b1dd13 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac85..e24d24eba 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 352875188..76661da79 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_X [fp, #0 ] +#define OLD_INC_X [fp, #4 ] +#define OLD_Y [fp, #8 ] +#define OLD_INC_Y [fp, #12 ] +#else +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12] +#define OLD_Y [fp, #16] +#define OLD_INC_Y [fp, #20] +#endif + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_X [fp, #16] +#define OLD_INC_X [fp, #20] +#define OLD_Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#endif // !defined(COMPLEX) + +#else #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] +#endif #define N r0 @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. push {r4 , fp} add fp, sp, #8 +#if !defined(__ARM_PCS_VFP) + ldr X, OLD_X +#endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60..43f2c0c0b 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..53d18b07b 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..a9d4eddeb 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd - #define FMAC_R2 fnmacd + #define FMAC_R1 vmls.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd + #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd - #define FMAC_I2 fnmacd + #define FMAC_I1 vmls.f64 + #define FMAC_I2 vmls.f64 #endif @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..3e3a1bc07 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..2193083af 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble zgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..cb6bc050e 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..3e6962f06 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld - #define FMAC_R2 fnmacd + #define FMAC_R1 vnmul.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuld #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld + #define FMAC_I1 vnmul.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmuld - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld + #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld - #define FMAC_I2 fnmacd + #define FMAC_I1 vnmul.f64 + #define FMAC_I2 vmls.f64 #endif @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 93ba50660..7d12c9885 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index b2b1bead1..613c4d286 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -39,25 +39,25 @@ static void 
ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 
128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 
o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 1dd03dc88..8d7d0c0b9 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "lxvw4x 40, 0, %4 \n\t" - "lxvw4x 41, %5, %4 \n\t" - "lxvw4x 42, %6, %4 \n\t" - "lxvw4x 43, %7, %4 \n\t" - "lxvw4x 44, %8, %4 \n\t" - "lxvw4x 45, %9, %4 \n\t" - "lxvw4x 46, %10, %4 \n\t" - "lxvw4x 47, %11, %4 \n\t" + "lxvd2x 40, 0, %4 \n\t" + "lxvd2x 41, %5, %4 \n\t" + "lxvd2x 42, %6, %4 \n\t" + "lxvd2x 43, %7, %4 \n\t" + "lxvd2x 44, %8, %4 \n\t" + "lxvd2x 45, %9, %4 \n\t" + "lxvd2x 46, %10, %4 \n\t" + "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 49, %5, %3 \n\t" - "lxvw4x 50, %6, %3 \n\t" - "lxvw4x 51, %7, %3 \n\t" - "lxvw4x 0, %8, %3 \n\t" - "lxvw4x 1, %9, %3 \n\t" - "lxvw4x 2, %10, %3 \n\t" - "lxvw4x 3, %11, %3 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 0, %8, %3 \n\t" + "lxvd2x 1, %9, %3 \n\t" + "lxvd2x 2, %10, %3 \n\t" + "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "lxvw4x 4, 0, %3 \n\t" - "lxvw4x 5, %5, %3 \n\t" - "lxvw4x 6, %6, %3 \n\t" - "lxvw4x 7, %7, %3 \n\t" - "lxvw4x 8, %8, %3 \n\t" - "lxvw4x 9, %9, %3 \n\t" - "lxvw4x 10, %10, %3 \n\t" - "lxvw4x 11, %11, %3 \n\t" + "lxvd2x 4, 0, %3 \n\t" + "lxvd2x 5, %5, %3 \n\t" + "lxvd2x 6, %6, %3 \n\t" + 
"lxvd2x 7, %7, %3 \n\t" + "lxvd2x 8, %8, %3 \n\t" + "lxvd2x 9, %9, %3 \n\t" + "lxvd2x 10, %10, %3 \n\t" + "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 48, 0, %4 \n\t" - "stxvw4x 49, %5, %4 \n\t" - "stxvw4x 50, %6, %4 \n\t" - "stxvw4x 51, %7, %4 \n\t" - "stxvw4x 0, %8, %4 \n\t" - "stxvw4x 1, %9, %4 \n\t" - "stxvw4x 2, %10, %4 \n\t" - "stxvw4x 3, %11, %4 \n\t" + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 0, %8, %4 \n\t" + "stxvd2x 1, %9, %4 \n\t" + "stxvd2x 2, %10, %4 \n\t" + "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "stxvw4x 4, 0, %4 \n\t" - "stxvw4x 5, %5, %4 \n\t" - "stxvw4x 6, %6, %4 \n\t" - "stxvw4x 7, %7, %4 \n\t" - "stxvw4x 8, %8, %4 \n\t" - "stxvw4x 9, %9, %4 \n\t" - "stxvw4x 10, %10, %4 \n\t" - "stxvw4x 11, %11, %4 \n\t" + "stxvd2x 4, 0, %4 \n\t" + "stxvd2x 5, %5, %4 \n\t" + "stxvd2x 6, %6, %4 \n\t" + "stxvd2x 7, %7, %4 \n\t" + "stxvd2x 8, %8, %4 \n\t" + "stxvd2x 9, %9, %4 \n\t" + "stxvd2x 10, %10, %4 \n\t" + "stxvd2x 11, %11, 
%4 \n\t" "addi %4, %4, 128 \n\t" diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 08a766f80..4bb515de8 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d5..7a54d5e1e 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 
45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git 
a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index 7f7ccfac3..bfe100c8b 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" + "lxvd2x 44, 
%13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 0a18c16e0..6eecb60a1 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" + "lxvd2x 48, 
0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n\t" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- 
a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 
\n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, %10, %2 \n\t" "addi %2, %2, 128 \n\t" diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 
44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 9cf518e05..bd31ed9c6 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") +set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") +endif() + #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})