Merge branch 'develop' into relapack

This commit is contained in:
Martin Kroeker 2017-07-13 21:18:02 +02:00 committed by GitHub
commit aaa65e06f1
65 changed files with 1252 additions and 827 deletions

View File

@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
) )
ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
# Declare the genconfig target as part of the default build (ALL) so that
# openblas_config.h is produced without having to request the target by name.
add_custom_target(genconfig ALL DEPENDS openblas_config.h)
add_dependencies(genconfig ${OpenBLAS_LIBNAME}) add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas ADD_CUSTOM_TARGET(genf77blas
ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
) )
@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas ADD_CUSTOM_TARGET(gencblas
ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
) )
add_dependencies(gencblas ${OpenBLAS_LIBNAME}) add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

View File

@ -1,5 +1,4 @@
#ifeq logical or ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android) ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a
@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif endif
endif endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
endif
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6) ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6 CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6
endif endif
ifeq ($(CORE), ARMV5) ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5 CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -marm -march=armv5 FCOMMON_OPT += -march=armv5
endif endif

View File

@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif endif
ifeq ($(CORE), THUNDERX2T99) ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif endif

View File

@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n" "#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n") "#define FUNDERSCORE\t${FU}\n")
# When the host OS is WINDOWSSTORE, additionally append an OS_WINNT define to
# the generated ${TARGET_CONF}.
# HOST_OS is expanded inside quotes so that the comparison stays well-formed
# when the variable is empty and the expanded value is compared as a string
# rather than being looked up again as a variable name.
if ("${HOST_OS}" STREQUAL "WINDOWSSTORE")
  file(APPEND ${TARGET_CONF}
    "#define OS_WINNT\t1\n")
endif ()

View File

@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1) set(NO_EXPRECISION 1)
endif () endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP) if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread") set(EXTRALIB "${EXTRALIB} -lpthread")
endif () endif ()

View File

@ -72,20 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif() endif()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
  # WindowsStore builds enable strict CRT checks; pass
  # _CRT_SECURE_NO_WARNINGS along with the other getarch flags to turn them off.
  list(APPEND GETARCH_FLAGS -D_CRT_SECURE_NO_WARNINGS)
endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR}) file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR} if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC} SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
) )
if (NOT ${GETARCH_RESULT}) if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif () endif ()
message(STATUS "Running getarch") message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way # use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR}) file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR} if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
) )
if (NOT ${GETARCH2_RESULT}) if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
endif ()
endif () endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way # use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
) )
if (NOT ${GEN_CONFIG_H_RESULT}) if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
endif () endif ()

View File

@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif #endif
#ifndef ASSEMBLER #ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t; typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n)) #define readenv(p, n) ((p)=getenv(n))
#endif #endif
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK #ifdef _POSIX_MONOTONIC_CLOCK
@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0; *address = 0;
} }
#ifdef OS_WINDOWSSTORE
static __inline int readenv_atoi(char *env) {
return 0;
}
#else
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) { static __inline int readenv_atoi(char *env) {
env_var_t p; env_var_t p;
@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0); return(0);
} }
#endif #endif
#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)

View File

@ -111,11 +111,6 @@ REALNAME:
#define PROFCODE #define PROFCODE
#ifdef __ARM_PCS
//-mfloat-abi=softfp
#define SOFT_FLOAT_ABI
#endif
#endif #endif

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;

View File

@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif #endif
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER];

View File

@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;

View File

@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;

View File

@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;

View File

@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args; blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;

View File

@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)i; di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i); di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;

View File

@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c) set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c) set(BLAS_SERVER blas_server_win32.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
set(BLAS_SERVER blas_server_win32.c)
endif () endif ()
if (NOT DEFINED BLAS_SERVER) if (NOT DEFINED BLAS_SERVER)

View File

@ -444,7 +444,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
for(i = 0; i < blas_num_threads - 1; i++){ for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE); WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0); TerminateThread(blas_threads[i],0);
#endif
} }
blas_server_avail = 0; blas_server_avail = 0;

View File

@ -354,6 +354,24 @@ static int numa_check(void) {
return common -> num_nodes; return common -> num_nodes;
} }
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 6)
/*
 * Replacement sched_getcpu(), compiled only when glibc is older than 2.6.
 *
 * Reads /proc/self/stat and returns the "processor" entry (field 39 in
 * proc(5)), i.e. the CPU the calling process last ran on.
 * Returns -1 if the file cannot be opened or parsed.
 *
 * The second field of the stat line is the executable name wrapped in
 * parentheses and may itself contain spaces (or parentheses), so counting
 * whitespace-separated tokens from the start of the line is unreliable.
 * Parsing therefore restarts after the LAST ')' in the line, which is the
 * end of the name, and counts fields from there.
 */
int sched_getcpu(void)
{
  char stat[1024];
  char *p, *after_comm = NULL;
  size_t len;
  int cpu, skipped;
  FILE *fp = NULL;

  if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
    return -1;
  /* fread rather than fgets: the name could in principle contain a newline */
  len = fread(stat, 1, sizeof(stat) - 1, fp);
  fclose (fp);
  stat[len] = '\0';

  /* locate the end of the "(comm)" field */
  for (p = stat; *p != '\0'; p++) {
    if (*p == ')') after_comm = p + 1;
  }
  if (after_comm == NULL)
    return -1;

  /* skip fields 3..38 (36 fields); the next one is the processor number */
  p = after_comm;
  for (skipped = 0; skipped < 36; skipped++) {
    while (*p == ' ') p++;
    if (*p == '\0' || *p == '\n')
      return -1;
    while (*p != '\0' && *p != ' ' && *p != '\n') p++;
  }

  if (sscanf(p, "%d", &cpu) != 1)
    return -1;
  return(cpu);
}
#endif
#endif
static void numa_mapping(void) { static void numa_mapping(void) {
int node, cpu, core; int node, cpu, core;
@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid; common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) { if (common -> magic != SH_MAGIC) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n"); fprintf(stderr, "Shared Memory Initialization.\n");
#endif #endif

View File

@ -1,7 +1,5 @@
include $(KERNELDIR)/KERNEL.ARMV5
###############################################################################
SAMAXKERNEL = iamax_vfp.S SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c SROTKERNEL = rot_vfp.S
DCOPYKERNEL = copy.c DROTKERNEL = rot_vfp.S
CCOPYKERNEL = zcopy.c CROTKERNEL = rot_vfp.S
ZCOPYKERNEL = zcopy.c ZROTKERNEL = rot_vfp.S
SDOTKERNEL = sdot_vfp.S SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S DDOTKERNEL = ddot_vfp.S
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SSWAPKERNEL = swap_vfp.S SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o
endif
SGEMMONCOPY = sgemm_ncopy_2_vfp.S SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o
endif
DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRMMKERNEL = strmm_kernel_4x2_vfp.S
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -1,91 +1,12 @@
include $(KERNELDIR)/KERNEL.ARMV6
#################################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
SNRM2KERNEL = nrm2_vfpv3.S SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal.c
DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = gemv_n_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

View File

@ -475,6 +475,14 @@ asum_kernel_L999:
vadd.f32 s0 , s0, s1 // set return value vadd.f32 s0 , s0, s1 // set return value
#endif #endif
// When __ARM_PCS_VFP is not defined (not the hard-float calling convention),
// copy the computed sum out of the VFP register into core registers before
// returning.
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
// single precision: s0 -> r0
vmov r0, s0
#else
// double precision: d0 -> r0 and r1
vmov r0, r1, d0
#endif
#endif
bx lr bx lr
EPILOGUE EPILOGUE

View File

@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI #if !defined(__ARM_PCS_VFP)
//hard
#define OLD_INC_X [fp, #0 ] #if !defined(COMPLEX)
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ] #if !defined(DOUBLE)
#else #define OLD_ALPHA r3
#define OLD_X [fp, #0 ] #define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ] #define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ] #define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ] #define OLD_INC_Y [fp, #12 ]
#else
#define OLD_ALPHA [fp, #0]
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define OLD_Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#endif #endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_ALPHAR [fp, #0]
#define OLD_ALPHAI [fp, #8]
#define OLD_X [fp, #16 ]
#define OLD_INC_X [fp, #20 ]
#define OLD_Y [fp, #24 ]
#define OLD_INC_Y [fp, #28 ]
#endif
#endif //!defined(COMPLEX)
#else //__ARM_PCS_VFP
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
#endif //!defined(__ARM_PCS_VFP)
#define N r0 #define N r0
#define Y r1 #define Y r1
#define INC_X r2 #define INC_X r2
@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE) #if defined(DOUBLE)
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#endif #endif
@ -370,10 +404,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8 add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#ifdef ARM_SOFTFP_ABI #if !defined(__ARM_PCS_VFP)
#ifndef DOUBLE #if !defined(COMPLEX)
vmov s0, r3 //move alpha to s0 #if !defined(DOUBLE)
vmov s0, OLD_ALPHA
ldr X, OLD_X ldr X, OLD_X
#else
vldr d0, OLD_ALPHA
ldr X, OLD_X
#endif
#else //COMPLEX
#if !defined(DOUBLE)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr X, OLD_X
#else
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr X, OLD_X
#endif
#endif #endif
#endif #endif

View File

@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0 #define N r0
#define X r1 #define X r1
#define INC_X r2 #define INC_X r2
#define OLD_Y r3
/****************************************************** /******************************************************
* [fp, #-128] - [fp, #-64] is reserved * [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers * registers
*******************************************************/ *******************************************************/
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ] #define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5 #define I r5
#define Y r6 #define Y r6
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 .align 5
push {r4 - r9, fp} push {r4 - r9, fp}
add fp, sp, #24 add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128 sub r4, fp, #128
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmov s2, s0 vmov s2, s0
vmov s3, s0 vmov s3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0 cmp N, #0
ble cdot_kernel_L999 ble cdot_kernel_L999
@ -265,7 +283,6 @@ cdot_kernel_S10:
cdot_kernel_L999: cdot_kernel_L999:
sub r3, fp, #128 sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers vldm r3, { s8 - s15} // restore floating point registers
@ -276,8 +293,11 @@ cdot_kernel_L999:
vadd.f32 s0 , s0, s2 vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3 vsub.f32 s1 , s1, s3
#endif #endif
// When __ARM_PCS_VFP is not defined, store the result pair s0 and s1 to the
// memory addressed by RETURN_ADDR (the register that received
// OLD_RETURN_ADDR at function entry).
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {s0 - s1}
#endif
sub sp, fp, #24 sub sp, fp, #28
pop {r4 - r9, fp} pop {r4 - r9, fp}
bx lr bx lr

View File

@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#else #else
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#endif #endif
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
// When __ARM_PCS_VFP is not defined, alpha and the A pointer are fetched
// from the *_SOFTFP locations (a core register and fp-relative stack slots)
// into the OLD_* operands that the code below works with.
#if !defined(__ARM_PCS_VFP)
// real part of alpha: from the core register OLD_ALPHAR_SOFTFP
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
// imaginary part of alpha: loaded from the stack
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
// pointer to A: loaded from the stack
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs #define FADD_R fsubs
#define FADD_I fadds #define FADD_I fadds
#define FMAC_R1 fnmacs #define FMAC_R1 vmls.f32
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fnmacs #define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs #define FADD_I fsubs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs #define FADD_R fsubs
#define FADD_I fadds #define FADD_I fadds
#define FMAC_R1 fnmacs #define FMAC_R1 vmls.f32
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fnmacs #define FMAC_I2 vmls.f32
#endif #endif
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_M r0 #define OLD_M r0
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ) #if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ) #elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ) #elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#else #else
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#endif #endif
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble cgemvn_kernel_L999 ble cgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_M, M str OLD_M, M
vstr s0 , ALPHA_R vstr s0 , ALPHA_R

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR r3
#define OLD_ALPHAI [fp, #0 ]
#define OLD_A_SOFTFP [fp, #4 ]
#define OLD_LDA [fp, #8 ]
#define X [fp, #12 ]
#define OLD_INC_X [fp, #16 ]
#define Y [fp, #20 ]
#define OLD_INC_Y [fp, #24 ]
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_N r1 #define OLD_N r1
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ) #if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ) #elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ) #elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#else #else
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#endif #endif
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0 cmp OLD_N, #0
ble cgemvt_kernel_L999 ble cgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vmov s0, OLD_ALPHAR
vldr s1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_N, N str OLD_N, N

View File

@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs #define FMAC_I1 fmacs
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
#define KMAC_R fmacs #define KMAC_R fmacs
#define KMAC_I fnmacs #define KMAC_I vmls.f32
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#else #else
#define KMAC_R fnmacs #define KMAC_R vmls.f32
#define KMAC_I fmacs #define KMAC_I fmacs
#define FMAC_R1 fmacs #define FMAC_R1 fmacs
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmacs #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#endif #endif
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP r3
#define OLD_ALPHAI_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #8 ]
#define B [fp, #12 ]
#define C [fp, #16 ]
#define OLD_LDC [fp, #20 ]
#define OFFSET [fp, #24 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs #define FADD_R fsubs
#define FADD_I fadds #define FADD_I fadds
#define FMAC_R1 fnmuls #define FMAC_R1 vnmul.f32
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls #define FMAC_I1 fmuls
#define FMAC_I2 fnmacs #define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuls #define FMAC_R1 fmuls
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmuls #define FMAC_I1 vnmul.f32
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs #define FADD_I fsubs
#define FMAC_R1 fmuls #define FMAC_R1 fmuls
#define FMAC_R2 fnmacs #define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls #define FMAC_I1 fmuls
#define FMAC_I2 fmacs #define FMAC_I2 fmacs
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs #define FADD_R fsubs
#define FADD_I fadds #define FADD_I fadds
#define FMAC_R1 fnmuls #define FMAC_R1 vnmul.f32
#define FMAC_R2 fmacs #define FMAC_R2 fmacs
#define FMAC_I1 fnmuls #define FMAC_I1 vnmul.f32
#define FMAC_I2 fnmacs #define FMAC_I2 vmls.f32
#endif #endif
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -246,6 +246,9 @@ ddot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value vadd.f64 d0 , d0, d1 // set return value
#if !defined(__ARM_PCS_VFP)
vmov r0, r1, d0
#endif
sub sp, fp, #24 sub sp, fp, #24
pop {r4 - r9, fp} pop {r4 - r9, fp}
bx lr bx lr

View File

@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280] #define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280] #define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ] #define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define OLD_C [fp, #8 ] #define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ] #define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP [fp, #4]
#define OLD_A_SOFTFP [fp, #12 ]
#define B [fp, #16 ]
#define OLD_C [fp, #20 ]
#define OLD_LDC [fp, #24 ]
#define OFFSET [fp, #28 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define OLD_C [fp, #8 ] #define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_M r0 #define OLD_M r0
@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble gemvn_kernel_L999 ble gemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_M, M str OLD_M, M

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI #if !defined(__ARM_PCS_VFP)
//hard
#define OLD_LDA [fp, #0 ] #if !defined(DOUBLE)
#define X [fp, #4 ] #define OLD_ALPHA r3
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ] #define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ] #define OLD_LDA [fp, #4 ]
#define X [fp, #8 ] #define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ] #define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ] #define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ] #define OLD_INC_Y [fp, #20 ]
#define OLD_ALPHA r3 #else
#define OLD_A r3 #define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif #endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_M r0 #define OLD_M r0
#define AO1 r0 #define AO1 r0
@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble gemvn_kernel_L999 ble gemvn_kernel_L999
#ifndef DOUBLE #if !defined(__ARM_PCS_VFP)
#ifdef ARM_SOFTFP_ABI #if !defined(DOUBLE)
vmov s0, OLD_ALPHA vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP #else
vldr d0, OLD_ALPHA
#endif #endif
ldr OLD_A, OLD_A_SOFTFP
#endif #endif
str OLD_A, A str OLD_A, A
str OLD_M, M str OLD_M, M
ldr INC_X , OLD_INC_X ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y ldr INC_Y , OLD_INC_Y

View File

@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#ifndef ARM_SOFTFP_ABI #if !defined(__ARM_PCS_VFP)
//hard abi
#define OLD_LDA [fp, #0 ] #if !defined(DOUBLE)
#define X [fp, #4 ] #define OLD_ALPHA r3
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#define OLD_A r3
#else
#define OLD_A_SOFTFP [fp, #0 ] #define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ] #define OLD_LDA [fp, #4 ]
#define X [fp, #8 ] #define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ] #define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ] #define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ] #define OLD_INC_Y [fp, #20 ]
#define OLD_ALPHA r3 #else
#define OLD_A r3 #define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif #endif
#else
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3
#define OLD_N r1 #define OLD_N r1
#define M r0 #define M r0
@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0 cmp OLD_N, #0
ble gemvt_kernel_L999 ble gemvt_kernel_L999
#ifndef DOUBLE #if !defined(__ARM_PCS_VFP)
#ifdef ARM_SOFTFP_ABI #if !defined(DOUBLE)
vmov s0, OLD_ALPHA vmov s0, OLD_ALPHA
ldr OLD_A, OLD_A_SOFTFP #else
vldr d0, OLD_ALPHA
#endif #endif
ldr OLD_A, OLD_A_SOFTFP
#endif #endif
str OLD_A, A str OLD_A, A

View File

@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_ALPHA r3
#define OLD_A_SOFTFP [fp, #0 ]
#define OLD_LDA [fp, #4 ]
#define X [fp, #8 ]
#define OLD_INC_X [fp, #12 ]
#define Y [fp, #16 ]
#define OLD_INC_Y [fp, #20 ]
#else
#define OLD_ALPHA [fp, #0 ]
#define OLD_A_SOFTFP [fp, #8 ]
#define OLD_LDA [fp, #12]
#define X [fp, #16]
#define OLD_INC_X [fp, #20]
#define Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_N r1 #define OLD_N r1
@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0 cmp OLD_N, #0
ble gemvt_kernel_L999 ble gemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov s0, OLD_ALPHA
#else
vldr d0, OLD_ALPHA
#endif
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_N, N str OLD_N, N

View File

@ -573,6 +573,13 @@ nrm2_kernel_L999:
#else #else
vsqrt.f32 s1, s1 vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1 vmul.f32 s0, s0, s1
#endif
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif #endif
bx lr bx lr

View File

@ -503,7 +503,12 @@ nrm2_kernel_L999:
#else #else
vsqrt.f32 s1, s1 vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1 vmul.f32 s0, s0, s1
#ifdef ARM_SOFTFP_ABI #endif
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0 vmov r0, s0
#endif #endif
#endif #endif

View File

@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_INC_Y [fp, #0 ] #define OLD_INC_Y [fp, #0 ]
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
#define OLD_C [fp, #4]
#define OLD_S [fp, #8]
#else
#define OLD_C [fp, #8]
#define OLD_S [fp, #16]
#endif
#endif
#define N r0 #define N r0
#define X r1 #define X r1
@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5 fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5 vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X, { d2 } fstmiad X, { d2 }
fstmiad Y, { d3 } fstmiad Y, { d3 }
@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5 fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5 vmul.f32 s3 , s0, s5
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X, { s2 } fstmias X, { s2 }
fstmias Y, { s3 } fstmias Y, { s3 }
@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
fstmiad X!, { d2 } fstmiad X!, { d2 }
fstmiad Y!, { d3 } fstmiad Y!, { d3 }
@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4 vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6 fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6 vmul.f64 d3 , d0, d6
fnmacd d3 , d1, d4 vmls.f64 d3 , d1, d4
vstr d2 , [ X, #0 ] vstr d2 , [ X, #0 ]
vstr d3 , [ Y, #0 ] vstr d3 , [ Y, #0 ]
vmul.f64 d2 , d0, d5 vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7 fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7 vmul.f64 d3 , d0, d7
fnmacd d3 , d1, d5 vmls.f64 d3 , d1, d5
vstr d2 , [ X, #8 ] vstr d2 , [ X, #8 ]
vstr d3 , [ Y, #8 ] vstr d3 , [ Y, #8 ]
@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
fstmias X!, { s2 } fstmias X!, { s2 }
fstmias Y!, { s3 } fstmias Y!, { s3 }
@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4 vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6 fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6 vmul.f32 s3 , s0, s6
fnmacs s3 , s1, s4 vmls.f32 s3 , s1, s4
vstr s2 , [ X, #0 ] vstr s2 , [ X, #0 ]
vstr s3 , [ Y, #0 ] vstr s3 , [ Y, #0 ]
vmul.f32 s2 , s0, s5 vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7 fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7 vmul.f32 s3 , s0, s7
fnmacs s3 , s1, s5 vmls.f32 s3 , s1, s5
vstr s2 , [ X, #4 ] vstr s2 , [ X, #4 ]
vstr s3 , [ Y, #4 ] vstr s3 , [ Y, #4 ]
@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8 add fp, sp, #8
ldr INC_Y , OLD_INC_Y ldr INC_Y , OLD_INC_Y
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vldr s0, OLD_C
vldr s1, OLD_S
#else
vldr d0, OLD_C
vldr d1, OLD_S
#endif
#endif
cmp N, #0 cmp N, #0
ble rot_kernel_L999 ble rot_kernel_L999

View File

@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X!, { d2 - d3 } fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X!, { d2 - d3 } fstmiad X!, { d2 - d3 }
@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X!, { d2 - d3 } fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X!, { d2 - d3 } fstmiad X!, { d2 - d3 }
@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X!, { d2 - d3 } fstmiad X!, { d2 - d3 }
@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 } fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4 vmul.f64 d2, d0, d4
fnmacd d2, d1, d5 vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5 vmul.f64 d3, d0, d5
fmacd d3, d1, d4 fmacd d3, d1, d4
fstmiad X, { d2 - d3 } fstmiad X, { d2 - d3 }
@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X!, { s2 - s3 } fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X!, { s2 - s3 } fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X!, { s2 - s3 } fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X!, { s2 - s3 } fstmias X!, { s2 - s3 }
@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X!, { s2 - s3 } fstmias X!, { s2 - s3 }
@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 } fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4 vmul.f32 s2, s0, s4
fnmacs s2, s1, s5 vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5 vmul.f32 s3, s0, s5
fmacs s3, s1, s4 fmacs s3, s1, s4
fstmias X, { s2 - s3 } fstmias X, { s2 - s3 }

View File

@ -329,20 +329,19 @@ sdot_kernel_L999:
vldm r3, { s8 - s15} // restore floating point registers vldm r3, { s8 - s15} // restore floating point registers
#if defined(DSDOT) #if defined(DSDOT)
vadd.f64 d0 , d0, d1 // set return value vadd.f64 d0 , d0, d1 // set return value
#else
#ifdef ARM_SOFTFP_ABI vadd.f32 s0 , s0, s1 // set return value
vmov r0, r1, d0
#endif #endif
#if !defined(__ARM_PCS_VFP)
#if defined(DSDOT)
vmov r0, r1, d0
#else #else
vadd.f32 s0 , s0, s1 // set return value
#ifdef ARM_SOFTFP_ABI
vmov r0, s0 vmov r0, s0
#endif #endif
#endif #endif
sub sp, fp, #24 sub sp, fp, #24
pop {r4 - r9, fp} pop {r4 - r9, fp}
bx lr bx lr

View File

@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280] #define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_M r0 #define OLD_M r0
#define OLD_N r1 #define OLD_N r1
#define OLD_K r2 #define OLD_K r2
#ifdef ARM_SOFTFP_ABI
#define OLD_ALPHA r3
//#define OLD_A
#else //hard
#define OLD_A r3 #define OLD_A r3
#define OLD_ALPHA s0 #define OLD_ALPHA s0
#endif
/****************************************************** /******************************************************
* [fp, #-128] - [fp, #-64] is reserved * [fp, #-128] - [fp, #-64] is reserved
@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define M [fp, #-256 ] #define M [fp, #-256 ]
#define N [fp, #-260 ] #define N [fp, #-260 ]
#define K [fp, #-264 ] #define K [fp, #-264 ]
#ifndef ARM_SOFTFP_ABI
#define A [fp, #-268 ] #define A [fp, #-268 ]
#endif
#define FP_ZERO [fp, #-240] #define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, #-240] #define FP_ZERO_0 [fp, #-240]
@ -88,12 +79,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280] #define ALPHA [fp, #-280]
#ifdef ARM_SOFTFP_ABI #if !defined(__ARM_PCS_VFP)
#define A [fp, #4 ] #define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ] #define B [fp, #8 ]
#define C [fp, #12 ] #define C [fp, #12 ]
#define OLD_LDC [fp, #16 ] #define OLD_LDC [fp, #16 ]
#else //hard #else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K
#ifdef ARM_SOFTFP_ABI
str OLD_ALPHA, ALPHA
#else //hard
str OLD_A, A str OLD_A, A
vstr OLD_ALPHA, ALPHA vstr OLD_ALPHA, ALPHA
#endif
sub r3, fp, #128 sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers vstm r3, { s8 - s31} // store floating point registers

View File

@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ] #define ALPHA [fp, #-276 ]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define OLD_C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define OLD_C [fp, #8 ] #define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280] #define ALPHA [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP r3
#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
#define OFFSET [fp, #20 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#if !defined(COMPLEX)
#if !defined(DOUBLE)
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
#else
#define OLD_X [fp, #8 ]
#define OLD_INC_X [fp, #12]
#define OLD_Y [fp, #16]
#define OLD_INC_Y [fp, #20]
#endif
#else //COMPLEX
#if !defined(DOUBLE)
#define OLD_X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define OLD_Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
#else
#define OLD_X [fp, #16]
#define OLD_INC_X [fp, #20]
#define OLD_Y [fp, #24]
#define OLD_INC_Y [fp, #28]
#endif
#endif // !defined(__ARM_PCS_VFP)
#else
#define OLD_INC_X [fp, #0 ] #define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ] #define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ] #define OLD_INC_Y [fp, #8 ]
#endif
#define N r0 #define N r0
@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp} push {r4 , fp}
add fp, sp, #8 add fp, sp, #8
#if !defined(__ARM_PCS_VFP)
ldr X, OLD_X
#endif
ldr INC_X , OLD_INC_X ldr INC_X , OLD_INC_X
ldr Y, OLD_Y ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y ldr INC_Y , OLD_INC_Y

View File

@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0 #define N r0
#define X r1 #define X r1
#define INC_X r2 #define INC_X r2
#define OLD_Y r3
/****************************************************** /******************************************************
* [fp, #-128] - [fp, #-64] is reserved * [fp, #-128] - [fp, #-64] is reserved
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers * registers
*******************************************************/ *******************************************************/
#if !defined(__ARM_PCS_VFP)
#define OLD_RETURN_ADDR r0
#define OLD_N r1
#define OLD_X r2
#define OLD_INC_X r3
#define OLD_Y [fp, #0 ]
#define OLD_INC_Y [fp, #4 ] #define OLD_INC_Y [fp, #4 ]
#define RETURN_ADDR r8
#else
#define OLD_Y r3
#define OLD_INC_Y [fp, #0 ]
#endif
#define I r5 #define I r5
#define Y r6 #define Y r6
@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5 .align 5
push {r4 - r9, fp} push {r4 - r9, fp}
add fp, sp, #24 add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128 sub r4, fp, #128
@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vcvt.f64.f32 d2, s0 vcvt.f64.f32 d2, s0
vcvt.f64.f32 d3, s0 vcvt.f64.f32 d3, s0
#if !defined(__ARM_PCS_VFP)
mov RETURN_ADDR, OLD_RETURN_ADDR
mov N, OLD_N
mov X, OLD_X
mov INC_X, OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y, OLD_INC_Y
#else
mov Y, OLD_Y mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y ldr INC_Y, OLD_INC_Y
#endif
cmp N, #0 cmp N, #0
ble zdot_kernel_L999 ble zdot_kernel_L999
@ -280,8 +297,11 @@ zdot_kernel_L999:
vadd.f64 d0 , d0, d2 vadd.f64 d0 , d0, d2
vsub.f64 d1 , d1, d3 vsub.f64 d1 , d1, d3
#endif #endif
#if !defined(__ARM_PCS_VFP)
vstm RETURN_ADDR, {d0 - d1}
#endif
sub sp, fp, #24 sub sp, fp, #28
pop {r4 - r9, fp} pop {r4 - r9, fp}
bx lr bx lr

View File

@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#endif #endif
@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd #define FADD_R fsubd
#define FADD_I faddd #define FADD_I faddd
#define FMAC_R1 fnmacd #define FMAC_R1 vmls.f64
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fnmacd #define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd #define FADD_I fsubd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd #define FADD_R fsubd
#define FADD_I faddd #define FADD_I faddd
#define FMAC_R1 fnmacd #define FMAC_R1 vmls.f64
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fnmacd #define FMAC_I2 vmls.f64
#endif #endif
@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_M r0 #define OLD_M r0
@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ) #if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ) #elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ) #elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#endif #endif
@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0 cmp N, #0
ble zgemvn_kernel_L999 ble zgemvn_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_M, M str OLD_M, M
vstr d0 , ALPHA_R vstr d0 , ALPHA_R

View File

@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256 #define STACKSIZE 256
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR [fp, #0 ]
#define OLD_ALPHAI [fp, #8 ]
#define OLD_A_SOFTFP [fp, #16]
#define OLD_LDA [fp, #20]
#define X [fp, #24]
#define OLD_INC_X [fp, #28]
#define Y [fp, #32]
#define OLD_INC_Y [fp, #36]
#else
#define OLD_LDA [fp, #0 ] #define OLD_LDA [fp, #0 ]
#define X [fp, #4 ] #define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ] #define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ] #define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ] #define OLD_INC_Y [fp, #16 ]
#endif
#define OLD_A r3 #define OLD_A r3
#define OLD_N r1 #define OLD_N r1
@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ) #if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ) #elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ) #elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#endif #endif
@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0 cmp OLD_N, #0
ble zgemvt_kernel_L999 ble zgemvt_kernel_L999
#if !defined(__ARM_PCS_VFP)
vldr d0, OLD_ALPHAR
vldr d1, OLD_ALPHAI
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A str OLD_A, A
str OLD_N, N str OLD_N, N

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd #define FMAC_I1 fmacd
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
#define KMAC_R fmacd #define KMAC_R fmacd
#define KMAC_I fnmacd #define KMAC_I vmls.f64
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#else #else
#define KMAC_R fnmacd #define KMAC_R vmls.f64
#define KMAC_I fmacd #define KMAC_I fmacd
#define FMAC_R1 fmacd #define FMAC_R1 fmacd
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmacd #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#endif #endif
@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272] #define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280] #define ALPHA_R [fp, #-280]
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP [fp, #4]
#define OLD_ALPHAI_SOFTFP [fp, #12]
#define OLD_A_SOFTFP [fp, #20 ]
#define B [fp, #24 ]
#define C [fp, #28 ]
#define OLD_LDC [fp, #32 ]
#define OFFSET [fp, #36 ]
#else
#define B [fp, #4 ] #define B [fp, #4 ]
#define C [fp, #8 ] #define C [fp, #8 ]
#define OLD_LDC [fp, #12 ] #define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ] #define OFFSET [fp, #16 ]
#endif
#define I r0 #define I r0
#define J r1 #define J r1
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd #define FADD_R fsubd
#define FADD_I faddd #define FADD_I faddd
#define FMAC_R1 fnmuld #define FMAC_R1 vnmul.f64
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld #define FMAC_I1 fmuld
#define FMAC_I2 fnmacd #define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT) #elif defined(CN) || defined(CT)
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuld #define FMAC_R1 fmuld
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmuld #define FMAC_I1 vnmul.f64
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
#elif defined(NC) || defined(TC) #elif defined(NC) || defined(TC)
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd #define FADD_I fsubd
#define FMAC_R1 fmuld #define FMAC_R1 fmuld
#define FMAC_R2 fnmacd #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld #define FMAC_I1 fmuld
#define FMAC_I2 fmacd #define FMAC_I2 fmacd
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd #define FADD_R fsubd
#define FADD_I faddd #define FADD_I faddd
#define FMAC_R1 fnmuld #define FMAC_R1 vnmul.f64
#define FMAC_R2 fmacd #define FMAC_R2 fmacd
#define FMAC_I1 fnmuld #define FMAC_I1 vnmul.f64
#define FMAC_I2 fnmacd #define FMAC_I2 vmls.f64
#endif #endif
@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24 add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack sub sp, sp, #STACKSIZE // reserve stack
#if !defined(__ARM_PCS_VFP)
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_M, M str OLD_M, M
str OLD_N, N str OLD_N, N
str OLD_K, K str OLD_K, K

View File

@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
"xxlxor 38, 38, 38 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t" "xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t" "xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t" "xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"

View File

@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
{ {
__asm__ __asm__
( (
"lxvw4x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t" "lxvd2x 33, %5, %2 \n\t"
"lxvw4x 34, %6, %2 \n\t" "lxvd2x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t" "lxvd2x 35, %7, %2 \n\t"
"lxvw4x 36, %8, %2 \n\t" "lxvd2x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t" "lxvd2x 37, %9, %2 \n\t"
"lxvw4x 38, %10, %2 \n\t" "lxvd2x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t" "lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
"lxvw4x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvw4x 33, %5, %2 \n\t" "lxvd2x 33, %5, %2 \n\t"
"stxvw4x 34, %6, %3 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t" "stxvd2x 35, %7, %3 \n\t"
"lxvw4x 34, %6, %2 \n\t" "lxvd2x 34, %6, %2 \n\t"
"lxvw4x 35, %7, %2 \n\t" "lxvd2x 35, %7, %2 \n\t"
"stxvw4x 36, %8, %3 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t" "stxvd2x 37, %9, %3 \n\t"
"lxvw4x 36, %8, %2 \n\t" "lxvd2x 36, %8, %2 \n\t"
"lxvw4x 37, %9, %2 \n\t" "lxvd2x 37, %9, %2 \n\t"
"stxvw4x 38, %10, %3 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t" "stxvd2x 39, %11, %3 \n\t"
"lxvw4x 38, %10, %2 \n\t" "lxvd2x 38, %10, %2 \n\t"
"lxvw4x 39, %11, %2 \n\t" "lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t" "stxvd2x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t" "stxvd2x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t" "stxvd2x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"2: \n\t" "2: \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t" "stxvd2x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t" "stxvd2x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t" "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t" "stxvd2x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t" "stxvd2x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n" "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

View File

@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"lxvw4x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t" "lxvd2x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t" "lxvd2x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t" "lxvd2x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t" "lxvd2x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t" "lxvd2x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t" "lxvd2x 39, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"lxvw4x 40, 0, %4 \n\t" "lxvd2x 40, 0, %4 \n\t"
"lxvw4x 41, %5, %4 \n\t" "lxvd2x 41, %5, %4 \n\t"
"lxvw4x 42, %6, %4 \n\t" "lxvd2x 42, %6, %4 \n\t"
"lxvw4x 43, %7, %4 \n\t" "lxvd2x 43, %7, %4 \n\t"
"lxvw4x 44, %8, %4 \n\t" "lxvd2x 44, %8, %4 \n\t"
"lxvw4x 45, %9, %4 \n\t" "lxvd2x 45, %9, %4 \n\t"
"lxvw4x 46, %10, %4 \n\t" "lxvd2x 46, %10, %4 \n\t"
"lxvw4x 47, %11, %4 \n\t" "lxvd2x 47, %11, %4 \n\t"
"addi %4, %4, -128 \n\t" "addi %4, %4, -128 \n\t"
"lxvw4x 48, 0, %3 \n\t" "lxvd2x 48, 0, %3 \n\t"
"lxvw4x 49, %5, %3 \n\t" "lxvd2x 49, %5, %3 \n\t"
"lxvw4x 50, %6, %3 \n\t" "lxvd2x 50, %6, %3 \n\t"
"lxvw4x 51, %7, %3 \n\t" "lxvd2x 51, %7, %3 \n\t"
"lxvw4x 0, %8, %3 \n\t" "lxvd2x 0, %8, %3 \n\t"
"lxvw4x 1, %9, %3 \n\t" "lxvd2x 1, %9, %3 \n\t"
"lxvw4x 2, %10, %3 \n\t" "lxvd2x 2, %10, %3 \n\t"
"lxvw4x 3, %11, %3 \n\t" "lxvd2x 3, %11, %3 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"lxvw4x 4, 0, %3 \n\t" "lxvd2x 4, 0, %3 \n\t"
"lxvw4x 5, %5, %3 \n\t" "lxvd2x 5, %5, %3 \n\t"
"lxvw4x 6, %6, %3 \n\t" "lxvd2x 6, %6, %3 \n\t"
"lxvw4x 7, %7, %3 \n\t" "lxvd2x 7, %7, %3 \n\t"
"lxvw4x 8, %8, %3 \n\t" "lxvd2x 8, %8, %3 \n\t"
"lxvw4x 9, %9, %3 \n\t" "lxvd2x 9, %9, %3 \n\t"
"lxvw4x 10, %10, %3 \n\t" "lxvd2x 10, %10, %3 \n\t"
"lxvw4x 11, %11, %3 \n\t" "lxvd2x 11, %11, %3 \n\t"
"addi %3, %3, -128 \n\t" "addi %3, %3, -128 \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t" "stxvd2x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t" "stxvd2x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t" "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t" "stxvd2x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t" "stxvd2x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t" "stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 48, 0, %4 \n\t" "stxvd2x 48, 0, %4 \n\t"
"stxvw4x 49, %5, %4 \n\t" "stxvd2x 49, %5, %4 \n\t"
"stxvw4x 50, %6, %4 \n\t" "stxvd2x 50, %6, %4 \n\t"
"stxvw4x 51, %7, %4 \n\t" "stxvd2x 51, %7, %4 \n\t"
"stxvw4x 0, %8, %4 \n\t" "stxvd2x 0, %8, %4 \n\t"
"stxvw4x 1, %9, %4 \n\t" "stxvd2x 1, %9, %4 \n\t"
"stxvw4x 2, %10, %4 \n\t" "stxvd2x 2, %10, %4 \n\t"
"stxvw4x 3, %11, %4 \n\t" "stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"stxvw4x 4, 0, %4 \n\t" "stxvd2x 4, 0, %4 \n\t"
"stxvw4x 5, %5, %4 \n\t" "stxvd2x 5, %5, %4 \n\t"
"stxvw4x 6, %6, %4 \n\t" "stxvd2x 6, %6, %4 \n\t"
"stxvw4x 7, %7, %4 \n\t" "stxvd2x 7, %7, %4 \n\t"
"stxvw4x 8, %8, %4 \n\t" "stxvd2x 8, %8, %4 \n\t"
"stxvw4x 9, %9, %4 \n\t" "stxvd2x 9, %9, %4 \n\t"
"stxvw4x 10, %10, %4 \n\t" "stxvd2x 10, %10, %4 \n\t"
"stxvw4x 11, %11, %4 \n\t" "stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"

View File

@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
"xxlxor 38, 38, 38 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t" "xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"lxvw4x 42, %9, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"lxvw4x 44, %11, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"lxvw4x 46, %13, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
"xvabssp 50, 42 \n\t" "xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t" "xvabssp 51, 43 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %8, %2 \n\t" "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t" "xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t" "xvabssp %x4, 45 \n\t"
"lxvw4x 42, %9, %2 \n\t" "lxvd2x 42, %9, %2 \n\t"
"lxvw4x 43, %10, %2 \n\t" "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t" "xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t" "xvabssp %x6, 47 \n\t"
"lxvw4x 44, %11, %2 \n\t" "lxvd2x 44, %11, %2 \n\t"
"lxvw4x 45, %12, %2 \n\t" "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t" "xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t" "xvaddsp 33, 33, 49 \n\t"
"lxvw4x 46, %13, %2 \n\t" "lxvd2x 46, %13, %2 \n\t"
"lxvw4x 47, %14, %2 \n\t" "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t" "xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t" "xvaddsp 35, 35, 51 \n\t"

View File

@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
{ {
__asm__ __asm__
( (
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvw4x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 41, %5, %2 \n\t" "lxvd2x 41, %5, %2 \n\t"
"stxvw4x 42, %6, %3 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t" "stxvd2x 43, %7, %3 \n\t"
"lxvw4x 42, %6, %2 \n\t" "lxvd2x 42, %6, %2 \n\t"
"lxvw4x 43, %7, %2 \n\t" "lxvd2x 43, %7, %2 \n\t"
"stxvw4x 44, %8, %3 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t" "stxvd2x 45, %9, %3 \n\t"
"lxvw4x 44, %8, %2 \n\t" "lxvd2x 44, %8, %2 \n\t"
"lxvw4x 45, %9, %2 \n\t" "lxvd2x 45, %9, %2 \n\t"
"stxvw4x 46, %10, %3 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n\t" "stxvd2x 47, %11, %3 \n\t"
"lxvw4x 46, %10, %2 \n\t" "lxvd2x 46, %10, %2 \n\t"
"lxvw4x 47, %11, %2 \n\t" "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"2: \n\t" "2: \n\t"
"stxvw4x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvw4x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
"stxvw4x 42, %6, %3 \n\t" "stxvd2x 42, %6, %3 \n\t"
"stxvw4x 43, %7, %3 \n\t" "stxvd2x 43, %7, %3 \n\t"
"stxvw4x 44, %8, %3 \n\t" "stxvd2x 44, %8, %3 \n\t"
"stxvw4x 45, %9, %3 \n\t" "stxvd2x 45, %9, %3 \n\t"
"stxvw4x 46, %10, %3 \n\t" "stxvd2x 46, %10, %3 \n\t"
"stxvw4x 47, %11, %3 \n" "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

View File

@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"xxlxor 38, 38, 38 \n\t" "xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t" "xxlxor 39, 39, 39 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t" "lxvd2x 48, 0, %3 \n\t"
"lxvw4x 41, %10, %2 \n\t" "lxvd2x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t" "lxvd2x 49, %10, %3 \n\t"
"lxvw4x 42, %11, %2 \n\t" "lxvd2x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t" "lxvd2x 50, %11, %3 \n\t"
"lxvw4x 43, %12, %2 \n\t" "lxvd2x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t" "lxvd2x 51, %12, %3 \n\t"
"lxvw4x 44, %13, %2 \n\t" "lxvd2x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t" "lxvd2x %x4, %13, %3 \n\t"
"lxvw4x 45, %14, %2 \n\t" "lxvd2x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t" "lxvd2x %x5, %14, %3 \n\t"
"lxvw4x 46, %15, %2 \n\t" "lxvd2x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t" "lxvd2x %x6, %15, %3 \n\t"
"lxvw4x 47, %16, %2 \n\t" "lxvd2x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t" "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"1: \n\t" "1: \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"lxvw4x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
"lxvw4x 48, 0, %3 \n\t" "lxvd2x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 33, 41, 49 \n\t"
"lxvw4x 41, %10, %2 \n\t" "lxvd2x 41, %10, %2 \n\t"
"lxvw4x 49, %10, %3 \n\t" "lxvd2x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t" "xvmaddasp 34, 42, 50 \n\t"
"lxvw4x 42, %11, %2 \n\t" "lxvd2x 42, %11, %2 \n\t"
"lxvw4x 50, %11, %3 \n\t" "lxvd2x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t" "xvmaddasp 35, 43, 51 \n\t"
"lxvw4x 43, %12, %2 \n\t" "lxvd2x 43, %12, %2 \n\t"
"lxvw4x 51, %12, %3 \n\t" "lxvd2x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, %x4 \n\t" "xvmaddasp 36, 44, %x4 \n\t"
"lxvw4x 44, %13, %2 \n\t" "lxvd2x 44, %13, %2 \n\t"
"lxvw4x %x4, %13, %3 \n\t" "lxvd2x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, %x5 \n\t" "xvmaddasp 37, 45, %x5 \n\t"
"lxvw4x 45, %14, %2 \n\t" "lxvd2x 45, %14, %2 \n\t"
"lxvw4x %x5, %14, %3 \n\t" "lxvd2x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, %x6 \n\t" "xvmaddasp 38, 46, %x6 \n\t"
"lxvw4x 46, %15, %2 \n\t" "lxvd2x 46, %15, %2 \n\t"
"lxvw4x %x6, %15, %3 \n\t" "lxvd2x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, %x7 \n\t" "xvmaddasp 39, 47, %x7 \n\t"
"lxvw4x 47, %16, %2 \n\t" "lxvd2x 47, %16, %2 \n\t"
"lxvw4x %x7, %16, %3 \n\t" "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"

View File

@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xscvdpspn 37, %x14 \n\t" // load s to all words "xscvdpspn 37, %x14 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t" "xxspltw 37, 37, 0 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x "lxvd2x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t" "lxvd2x 33, %15, %3 \n\t"
"lxvw4x 34, %16, %3 \n\t" "lxvd2x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t" "lxvd2x 35, %17, %3 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y "lxvd2x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t" "lxvd2x 49, %15, %4 \n\t"
"lxvw4x 50, %16, %4 \n\t" "lxvd2x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t" "lxvd2x 51, %17, %4 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t" "addi %4, %4, 64 \n\t"
@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t" "xvmulsp 45, 33, 37 \n\t"
"lxvw4x 32, 0, %3 \n\t" // load x "lxvd2x 32, 0, %3 \n\t" // load x
"lxvw4x 33, %15, %3 \n\t" "lxvd2x 33, %15, %3 \n\t"
"xvmulsp 46, 34, 37 \n\t" "xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t" "xvmulsp 47, 35, 37 \n\t"
"lxvw4x 34, %16, %3 \n\t" "lxvd2x 34, %16, %3 \n\t"
"lxvw4x 35, %17, %3 \n\t" "lxvd2x 35, %17, %3 \n\t"
"xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp %x10, 49, 37 \n\t" "xvmulsp %x10, 49, 37 \n\t"
"lxvw4x 48, 0, %4 \n\t" // load y "lxvd2x 48, 0, %4 \n\t" // load y
"lxvw4x 49, %15, %4 \n\t" "lxvd2x 49, %15, %4 \n\t"
"xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x11, 50, 37 \n\t"
"xvmulsp %x12, 51, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t"
"lxvw4x 50, %16, %4 \n\t" "lxvd2x 50, %16, %4 \n\t"
"lxvw4x 51, %17, %4 \n\t" "lxvd2x 51, %17, %4 \n\t"
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x "stxvd2x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t" "stxvd2x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t" "stxvd2x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t" "stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y "stxvd2x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t" "stxvd2x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n\t" "stxvd2x %x8, %17, %4 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
"stxvw4x 40, 0, %3 \n\t" // store x "stxvd2x 40, 0, %3 \n\t" // store x
"stxvw4x 41, %15, %3 \n\t" "stxvd2x 41, %15, %3 \n\t"
"stxvw4x 42, %16, %3 \n\t" "stxvd2x 42, %16, %3 \n\t"
"stxvw4x 43, %17, %3 \n\t" "stxvd2x 43, %17, %3 \n\t"
"stxvw4x %x5, 0, %4 \n\t" // store y "stxvd2x %x5, 0, %4 \n\t" // store y
"stxvw4x %x6, %15, %4 \n\t" "stxvd2x %x6, %15, %4 \n\t"
"stxvw4x %x7, %16, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t"
"stxvw4x %x8, %17, %4 \n" "stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"

View File

@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t" "xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t" "xxspltw %x3, %x3, 0 \n\t"
"lxvw4x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t" "lxvd2x 33, %4, %2 \n\t"
"lxvw4x 34, %5, %2 \n\t" "lxvd2x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t" "lxvd2x 35, %6, %2 \n\t"
"lxvw4x 36, %7, %2 \n\t" "lxvd2x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t" "lxvd2x 37, %8, %2 \n\t"
"lxvw4x 38, %9, %2 \n\t" "lxvd2x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t" "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t"
"lxvw4x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvw4x 33, %4, %2 \n\t" "lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t" "xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t"
"lxvw4x 34, %5, %2 \n\t" "lxvd2x 34, %5, %2 \n\t"
"lxvw4x 35, %6, %2 \n\t" "lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t" "xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t"
"lxvw4x 36, %7, %2 \n\t" "lxvd2x 36, %7, %2 \n\t"
"lxvw4x 37, %8, %2 \n\t" "lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t" "xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t"
"lxvw4x 38, %9, %2 \n\t" "lxvd2x 38, %9, %2 \n\t"
"lxvw4x 39, %10, %2 \n\t" "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t" "addi %2, %2, -128 \n\t"
"stxvw4x 40, 0, %2 \n\t" "stxvd2x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t" "stxvd2x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t" "stxvd2x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t" "stxvd2x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t" "stxvd2x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t" "stxvd2x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t" "stxvd2x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n\t" "stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t" "xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t"
"stxvw4x 40, 0, %2 \n\t" "stxvd2x 40, 0, %2 \n\t"
"stxvw4x 41, %4, %2 \n\t" "stxvd2x 41, %4, %2 \n\t"
"stxvw4x 42, %5, %2 \n\t" "stxvd2x 42, %5, %2 \n\t"
"stxvw4x 43, %6, %2 \n\t" "stxvd2x 43, %6, %2 \n\t"
"stxvw4x 44, %7, %2 \n\t" "stxvd2x 44, %7, %2 \n\t"
"stxvw4x 45, %8, %2 \n\t" "stxvd2x 45, %8, %2 \n\t"
"stxvw4x 46, %9, %2 \n\t" "stxvd2x 46, %9, %2 \n\t"
"stxvw4x 47, %10, %2 \n" "stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :
@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"stxvw4x %x3, 0, %2 \n\t" "stxvd2x %x3, 0, %2 \n\t"
"stxvw4x %x3, %4, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t"
"stxvw4x %x3, %5, %2 \n\t" "stxvd2x %x3, %5, %2 \n\t"
"stxvw4x %x3, %6, %2 \n\t" "stxvd2x %x3, %6, %2 \n\t"
"stxvw4x %x3, %7, %2 \n\t" "stxvd2x %x3, %7, %2 \n\t"
"stxvw4x %x3, %8, %2 \n\t" "stxvd2x %x3, %8, %2 \n\t"
"stxvw4x %x3, %9, %2 \n\t" "stxvd2x %x3, %9, %2 \n\t"
"stxvw4x %x3, %10, %2 \n\t" "stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"

View File

@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n" ".p2align 5 \n"
"1: \n\t" "1: \n\t"
"lxvw4x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvw4x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
"lxvw4x 34, %6, %4 \n\t" "lxvd2x 34, %6, %4 \n\t"
"lxvw4x 35, %7, %4 \n\t" "lxvd2x 35, %7, %4 \n\t"
"lxvw4x 36, %8, %4 \n\t" "lxvd2x 36, %8, %4 \n\t"
"lxvw4x 37, %9, %4 \n\t" "lxvd2x 37, %9, %4 \n\t"
"lxvw4x 38, %10, %4 \n\t" "lxvd2x 38, %10, %4 \n\t"
"lxvw4x 39, %11, %4 \n\t" "lxvd2x 39, %11, %4 \n\t"
"lxvw4x 40, 0, %3 \n\t" "lxvd2x 40, 0, %3 \n\t"
"lxvw4x 41, %5, %3 \n\t" "lxvd2x 41, %5, %3 \n\t"
"lxvw4x 42, %6, %3 \n\t" "lxvd2x 42, %6, %3 \n\t"
"lxvw4x 43, %7, %3 \n\t" "lxvd2x 43, %7, %3 \n\t"
"lxvw4x 44, %8, %3 \n\t" "lxvd2x 44, %8, %3 \n\t"
"lxvw4x 45, %9, %3 \n\t" "lxvd2x 45, %9, %3 \n\t"
"lxvw4x 46, %10, %3 \n\t" "lxvd2x 46, %10, %3 \n\t"
"lxvw4x 47, %11, %3 \n\t" "lxvd2x 47, %11, %3 \n\t"
"stxvw4x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvw4x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
"stxvw4x 34, %6, %3 \n\t" "stxvd2x 34, %6, %3 \n\t"
"stxvw4x 35, %7, %3 \n\t" "stxvd2x 35, %7, %3 \n\t"
"stxvw4x 36, %8, %3 \n\t" "stxvd2x 36, %8, %3 \n\t"
"stxvw4x 37, %9, %3 \n\t" "stxvd2x 37, %9, %3 \n\t"
"stxvw4x 38, %10, %3 \n\t" "stxvd2x 38, %10, %3 \n\t"
"stxvw4x 39, %11, %3 \n\t" "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"stxvw4x 40, 0, %4 \n\t" "stxvd2x 40, 0, %4 \n\t"
"stxvw4x 41, %5, %4 \n\t" "stxvd2x 41, %5, %4 \n\t"
"stxvw4x 42, %6, %4 \n\t" "stxvd2x 42, %6, %4 \n\t"
"stxvw4x 43, %7, %4 \n\t" "stxvd2x 43, %7, %4 \n\t"
"stxvw4x 44, %8, %4 \n\t" "stxvd2x 44, %8, %4 \n\t"
"stxvw4x 45, %9, %4 \n\t" "stxvd2x 45, %9, %4 \n\t"
"stxvw4x 46, %10, %4 \n\t" "stxvd2x 46, %10, %4 \n\t"
"stxvw4x 47, %11, %4 \n\t" "stxvd2x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"

View File

@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
target_link_libraries(${OpenBLAS_utest_bin} m) target_link_libraries(${OpenBLAS_utest_bin} m)
endif() endif()
# When targeting the WindowsStore platform, define _CRT_SECURE_NO_WARNINGS for
# the unit-test binary. APPEND adds to the target's COMPILE_DEFINITIONS instead
# of replacing the property, so definitions already attached to the target
# are preserved.
if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
  set_property(TARGET ${OpenBLAS_utest_bin} APPEND PROPERTY COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS")
endif()
#Set output for utest #Set output for utest
set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})