Merge branch 'develop' into relapack
This commit is contained in:
commit
aaa65e06f1
|
@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
|
|||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
|
||||
)
|
||||
ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
|
||||
|
||||
ADD_CUSTOM_TARGET(genconfig
|
||||
ALL
|
||||
DEPENDS openblas_config.h
|
||||
)
|
||||
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
|
|||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(genf77blas
|
||||
ALL
|
||||
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
|
||||
)
|
||||
|
@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
|
|||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
|
||||
ADD_CUSTOM_TARGET(gencblas
|
||||
ALL
|
||||
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
|
||||
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
|
||||
)
|
||||
|
||||
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
|
||||
|
||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
|
23
Makefile.arm
23
Makefile.arm
|
@ -1,5 +1,4 @@
|
|||
#ifeq logical or
|
||||
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
|
@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
ifeq ($(OSNAME), Android)
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -marm -march=armv5
|
||||
FCOMMON_OPT += -marm -march=armv5
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
endif
|
||||
|
|
|
@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
||||
endif
|
||||
|
|
|
@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
|
|||
"#define __${BINARY}BIT__\t1\n"
|
||||
"#define FUNDERSCORE\t${FU}\n")
|
||||
|
||||
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
|
||||
file(APPEND ${TARGET_CONF}
|
||||
"#define OS_WINNT\t1\n")
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ if (CYGWIN)
|
|||
set(NO_EXPRECISION 1)
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
|
||||
if (SMP)
|
||||
set(EXTRALIB "${EXTRALIB} -lpthread")
|
||||
endif ()
|
||||
|
|
|
@ -72,20 +72,26 @@ if (MSVC)
|
|||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
endif()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
# disable WindowsStore strict CRT checks
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
|
||||
endif ()
|
||||
|
||||
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
||||
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
message(STATUS "Running getarch")
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
|
@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
|
|||
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
||||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH2_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
|
||||
if (NOT ${GETARCH2_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
|
@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
|
|||
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
|
||||
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
|
||||
|
||||
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
|
||||
)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GEN_CONFIG_H_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
||||
endif ()
|
||||
if (NOT ${GEN_CONFIG_H_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
|
||||
endif ()
|
||||
endif ()
|
13
common.h
13
common.h
|
@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) 0
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||
|
@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
|
|||
typedef char* env_var_t;
|
||||
#define readenv(p, n) ((p)=getenv(n))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
|
||||
#ifdef _POSIX_MONOTONIC_CLOCK
|
||||
|
@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
|
|||
*address = 0;
|
||||
}
|
||||
|
||||
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
static __inline int readenv_atoi(char *env) {
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
#ifdef OS_WINDOWS
|
||||
static __inline int readenv_atoi(char *env) {
|
||||
env_var_t p;
|
||||
|
@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
|
|||
return(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
|
||||
|
||||
|
|
|
@ -111,11 +111,6 @@ REALNAME:
|
|||
|
||||
#define PROFCODE
|
||||
|
||||
#ifdef __ARM_PCS
|
||||
//-mfloat-abi=softfp
|
||||
#define SOFT_FLOAT_ABI
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
|
|||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
|
|
@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
|||
#endif
|
||||
|
||||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
blas_queue_t queue[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
|
||||
|
|
|
@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
|||
blas_arg_t args;
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
BLASLONG range_m[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER];
|
||||
BLASLONG range_n[MAX_CPU_NUMBER + 1];
|
||||
|
||||
BLASLONG width, i, num_cpu;
|
||||
|
||||
|
|
|
@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)i;
|
||||
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
|
||||
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
|
@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
if (nthreads - num_cpu > 1) {
|
||||
|
||||
di = (double)(arg -> n - i);
|
||||
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
|
||||
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
|
||||
|
||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||
|
||||
|
|
|
@ -12,6 +12,8 @@ if (SMP)
|
|||
set(BLAS_SERVER blas_server_omp.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
|
||||
set(BLAS_SERVER blas_server_win32.c)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED BLAS_SERVER)
|
||||
|
|
|
@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
SetEvent(pool.killed);
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
TerminateThread(blas_threads[i],0);
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
#endif
|
||||
}
|
||||
|
||||
blas_server_avail = 0;
|
||||
|
|
|
@ -354,6 +354,24 @@ static int numa_check(void) {
|
|||
return common -> num_nodes;
|
||||
}
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int sched_getcpu(void)
|
||||
{
|
||||
int cpu;
|
||||
FILE *fp = NULL;
|
||||
if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
|
||||
return -1;
|
||||
if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) {
|
||||
fclose (fp);
|
||||
return -1;
|
||||
}
|
||||
fclose (fp);
|
||||
return(cpu);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static void numa_mapping(void) {
|
||||
|
||||
int node, cpu, core;
|
||||
|
@ -808,7 +826,6 @@ void gotoblas_affinity_init(void) {
|
|||
common -> shmid = pshmid;
|
||||
|
||||
if (common -> magic != SH_MAGIC) {
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Shared Memory Initialization.\n");
|
||||
#endif
|
||||
|
@ -830,7 +847,7 @@ void gotoblas_affinity_init(void) {
|
|||
if (common -> num_nodes > 1) numa_mapping();
|
||||
|
||||
common -> final_num_procs = 0;
|
||||
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
|
||||
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
|
||||
|
||||
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV5
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
|
@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
|
|||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
|
@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
|
|||
CNRM2KERNEL = nrm2_vfp.S
|
||||
ZNRM2KERNEL = nrm2_vfp.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
|
@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
|
|||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
|
||||
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
endif
|
||||
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
|
||||
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
endif
|
||||
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
|
@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
|||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x2_vfp.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
|
||||
|
||||
|
|
|
@ -1,91 +1,12 @@
|
|||
|
||||
#################################################################################
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
ZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SAMINKERNEL = iamax_vfp.S
|
||||
DAMINKERNEL = iamax_vfp.S
|
||||
CAMINKERNEL = iamax_vfp.S
|
||||
ZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
SMAXKERNEL = iamax_vfp.S
|
||||
DMAXKERNEL = iamax_vfp.S
|
||||
|
||||
SMINKERNEL = iamax_vfp.S
|
||||
DMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMAXKERNEL = iamax_vfp.S
|
||||
IDAMAXKERNEL = iamax_vfp.S
|
||||
ICAMAXKERNEL = iamax_vfp.S
|
||||
IZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMINKERNEL = iamax_vfp.S
|
||||
IDAMINKERNEL = iamax_vfp.S
|
||||
ICAMINKERNEL = iamax_vfp.S
|
||||
IZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISMAXKERNEL = iamax_vfp.S
|
||||
IDMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISMINKERNEL = iamax_vfp.S
|
||||
IDMINKERNEL = iamax_vfp.S
|
||||
|
||||
SSWAPKERNEL = swap_vfp.S
|
||||
DSWAPKERNEL = swap_vfp.S
|
||||
CSWAPKERNEL = swap_vfp.S
|
||||
ZSWAPKERNEL = swap_vfp.S
|
||||
|
||||
SASUMKERNEL = asum_vfp.S
|
||||
DASUMKERNEL = asum_vfp.S
|
||||
CASUMKERNEL = asum_vfp.S
|
||||
ZASUMKERNEL = asum_vfp.S
|
||||
|
||||
SAXPYKERNEL = axpy_vfp.S
|
||||
DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
CDOTKERNEL = cdot_vfp.S
|
||||
ZDOTKERNEL = zdot_vfp.S
|
||||
include $(KERNELDIR)/KERNEL.ARMV6
|
||||
|
||||
SNRM2KERNEL = nrm2_vfpv3.S
|
||||
DNRM2KERNEL = nrm2_vfpv3.S
|
||||
CNRM2KERNEL = nrm2_vfpv3.S
|
||||
ZNRM2KERNEL = nrm2_vfpv3.S
|
||||
|
||||
SROTKERNEL = rot_vfp.S
|
||||
DROTKERNEL = rot_vfp.S
|
||||
CROTKERNEL = rot_vfp.S
|
||||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal.c
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
|
||||
SGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
DGEMVNKERNEL = gemv_n_vfpv3.S
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t_vfp.S
|
||||
DGEMVTKERNEL = gemv_t_vfp.S
|
||||
CGEMVTKERNEL = cgemv_t_vfp.S
|
||||
ZGEMVTKERNEL = zgemv_t_vfp.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
|
||||
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
|
||||
|
@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
|
|||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
|
||||
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
|
||||
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
|
||||
|
||||
|
|
|
@ -475,6 +475,14 @@ asum_kernel_L999:
|
|||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
#else
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_X [fp, #0 ]
|
||||
#define OLD_INC_X [fp, #4 ]
|
||||
#define OLD_Y [fp, #8 ]
|
||||
#define OLD_INC_Y [fp, #12 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0]
|
||||
#define OLD_X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define OLD_Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#endif
|
||||
|
||||
|
||||
#else //COMPLEX
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define OLD_Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#else
|
||||
#define OLD_ALPHAR [fp, #0]
|
||||
#define OLD_ALPHAI [fp, #8]
|
||||
#define OLD_X [fp, #16 ]
|
||||
#define OLD_INC_X [fp, #20 ]
|
||||
#define OLD_Y [fp, #24 ]
|
||||
#define OLD_INC_Y [fp, #28 ]
|
||||
#endif
|
||||
|
||||
#endif //!defined(COMPLEX)
|
||||
|
||||
#else //__ARM_PCS_VFP
|
||||
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
|
||||
#endif //!defined(__ARM_PCS_VFP)
|
||||
|
||||
#define N r0
|
||||
#define Y r1
|
||||
#define INC_X r2
|
||||
|
@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(DOUBLE)
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #8
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#ifndef DOUBLE
|
||||
vmov s0, r3 //move alpha to s0
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
#else //COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#else
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
|
|
@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
#define OLD_Y r3
|
||||
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
|
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
* registers
|
||||
*******************************************************/
|
||||
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_RETURN_ADDR r0
|
||||
#define OLD_N r1
|
||||
#define OLD_X r2
|
||||
#define OLD_INC_X r3
|
||||
#define OLD_Y [fp, #0 ]
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#define RETURN_ADDR r8
|
||||
#else
|
||||
#define OLD_Y r3
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
#endif
|
||||
|
||||
#define I r5
|
||||
#define Y r6
|
||||
|
@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
|
||||
push {r4 - r9, fp}
|
||||
add fp, sp, #24
|
||||
add fp, sp, #28
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
sub r4, fp, #128
|
||||
|
@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmov s2, s0
|
||||
vmov s3, s0
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
mov RETURN_ADDR, OLD_RETURN_ADDR
|
||||
mov N, OLD_N
|
||||
mov X, OLD_X
|
||||
mov INC_X, OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#else
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble cdot_kernel_L999
|
||||
|
@ -265,7 +283,6 @@ cdot_kernel_S10:
|
|||
|
||||
|
||||
cdot_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { s8 - s15} // restore floating point registers
|
||||
|
||||
|
@ -276,8 +293,11 @@ cdot_kernel_L999:
|
|||
vadd.f32 s0 , s0, s2
|
||||
vsub.f32 s1 , s1, s3
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vstm RETURN_ADDR, {s0 - s1}
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
|
|
|
@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R1 vmls.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
|
@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp N, #0
|
||||
ble cgemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
vstr s0 , ALPHA_R
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR r3
|
||||
#define OLD_ALPHAI [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define OLD_LDA [fp, #8 ]
|
||||
#define X [fp, #12 ]
|
||||
#define OLD_INC_X [fp, #16 ]
|
||||
#define Y [fp, #20 ]
|
||||
#define OLD_INC_Y [fp, #24 ]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
|
@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp OLD_N, #0
|
||||
ble cgemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov s0, OLD_ALPHAR
|
||||
vldr s1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
|
|
@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacs
|
||||
#define KMAC_I fnmacs
|
||||
#define KMAC_I vmls.f32
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacs
|
||||
#define KMAC_R vmls.f32
|
||||
#define KMAC_I fmacs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I1 vmls.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#endif
|
||||
|
@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP r3
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define B [fp, #12 ]
|
||||
#define C [fp, #16 ]
|
||||
#define OLD_LDC [fp, #20 ]
|
||||
#define OFFSET [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmuls
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_R2 vmls.f32
|
||||
#define FMAC_I1 fmuls
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
|
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmuls
|
||||
#define FMAC_R1 vnmul.f32
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmuls
|
||||
#define FMAC_I2 fnmacs
|
||||
#define FMAC_I1 vnmul.f32
|
||||
#define FMAC_I2 vmls.f32
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -246,6 +246,9 @@ ddot_kernel_L999:
|
|||
vldm r3, { d8 - d15} // restore floating point registers
|
||||
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
sub sp, fp, #24
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
|
|
@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP [fp, #4]
|
||||
#define OLD_A_SOFTFP [fp, #12 ]
|
||||
#define B [fp, #16 ]
|
||||
#define OLD_C [fp, #20 ]
|
||||
#define OLD_LDC [fp, #24 ]
|
||||
#define OFFSET [fp, #28 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
|
@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp N, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
|
||||
|
|
|
@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#define OLD_A r3
|
||||
#else
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
#define AO1 r0
|
||||
|
@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp N, #0
|
||||
ble gemvn_kernel_L999
|
||||
|
||||
#ifndef DOUBLE
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
|
||||
|
||||
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
|
|
|
@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
//hard abi
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#define OLD_A r3
|
||||
#else
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
#define M r0
|
||||
|
@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp OLD_N, #0
|
||||
ble gemvt_kernel_L999
|
||||
|
||||
#ifndef DOUBLE
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov s0, OLD_ALPHA
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
|
|
|
@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_ALPHA r3
|
||||
#define OLD_A_SOFTFP [fp, #0 ]
|
||||
#define OLD_LDA [fp, #4 ]
|
||||
#define X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12 ]
|
||||
#define Y [fp, #16 ]
|
||||
#define OLD_INC_Y [fp, #20 ]
|
||||
#else
|
||||
#define OLD_ALPHA [fp, #0 ]
|
||||
#define OLD_A_SOFTFP [fp, #8 ]
|
||||
#define OLD_LDA [fp, #12]
|
||||
#define X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
|
@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp OLD_N, #0
|
||||
ble gemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov s0, OLD_ALPHA
|
||||
#else
|
||||
vldr d0, OLD_ALPHA
|
||||
#endif
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
|
|
@ -573,6 +573,13 @@ nrm2_kernel_L999:
|
|||
#else
|
||||
vsqrt.f32 s1, s1
|
||||
vmul.f32 s0, s0, s1
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
|
|
@ -503,8 +503,13 @@ nrm2_kernel_L999:
|
|||
#else
|
||||
vsqrt.f32 s1, s1
|
||||
vmul.f32 s0, s0, s1
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, s0
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if defined(DOUBLE)
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
vmov r0, s0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_C [fp, #4]
|
||||
#define OLD_S [fp, #8]
|
||||
#else
|
||||
#define OLD_C [fp, #8]
|
||||
#define OLD_S [fp, #16]
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define N r0
|
||||
#define X r1
|
||||
|
@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d5
|
||||
vmul.f64 d3 , d0, d5
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X, { d2 }
|
||||
fstmiad Y, { d3 }
|
||||
|
||||
|
@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s5
|
||||
vmul.f32 s3 , s0, s5
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X, { s2 }
|
||||
fstmias Y, { s3 }
|
||||
|
||||
|
@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
fstmiad X!, { d2 }
|
||||
fstmiad Y!, { d3 }
|
||||
|
||||
|
@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f64 d2 , d0, d4
|
||||
fmacd d2 , d1, d6
|
||||
vmul.f64 d3 , d0, d6
|
||||
fnmacd d3 , d1, d4
|
||||
vmls.f64 d3 , d1, d4
|
||||
vstr d2 , [ X, #0 ]
|
||||
vstr d3 , [ Y, #0 ]
|
||||
vmul.f64 d2 , d0, d5
|
||||
fmacd d2 , d1, d7
|
||||
vmul.f64 d3 , d0, d7
|
||||
fnmacd d3 , d1, d5
|
||||
vmls.f64 d3 , d1, d5
|
||||
vstr d2 , [ X, #8 ]
|
||||
vstr d3 , [ Y, #8 ]
|
||||
|
||||
|
@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
fstmias X!, { s2 }
|
||||
fstmias Y!, { s3 }
|
||||
|
||||
|
@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vmul.f32 s2 , s0, s4
|
||||
fmacs s2 , s1, s6
|
||||
vmul.f32 s3 , s0, s6
|
||||
fnmacs s3 , s1, s4
|
||||
vmls.f32 s3 , s1, s4
|
||||
vstr s2 , [ X, #0 ]
|
||||
vstr s3 , [ Y, #0 ]
|
||||
vmul.f32 s2 , s0, s5
|
||||
fmacs s2 , s1, s7
|
||||
vmul.f32 s3 , s0, s7
|
||||
fnmacs s3 , s1, s5
|
||||
vmls.f32 s3 , s1, s5
|
||||
vstr s2 , [ X, #4 ]
|
||||
vstr s3 , [ Y, #4 ]
|
||||
|
||||
|
@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #8
|
||||
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vldr s0, OLD_C
|
||||
vldr s1, OLD_S
|
||||
#else
|
||||
vldr d0, OLD_C
|
||||
vldr d1, OLD_S
|
||||
#endif
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble rot_kernel_L999
|
||||
|
|
|
@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X!, { d2 - d3 }
|
||||
|
@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmiad X, { d4 - d5 }
|
||||
vmul.f64 d2, d0, d4
|
||||
fnmacd d2, d1, d5
|
||||
vmls.f64 d2, d1, d5
|
||||
vmul.f64 d3, d0, d5
|
||||
fmacd d3, d1, d4
|
||||
fstmiad X, { d2 - d3 }
|
||||
|
@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X!, { s2 - s3 }
|
||||
|
@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
fldmias X, { s4 - s5 }
|
||||
vmul.f32 s2, s0, s4
|
||||
fnmacs s2, s1, s5
|
||||
vmls.f32 s2, s1, s5
|
||||
vmul.f32 s3, s0, s5
|
||||
fmacs s3, s1, s4
|
||||
fstmias X, { s2 - s3 }
|
||||
|
|
|
@ -329,20 +329,19 @@ sdot_kernel_L999:
|
|||
vldm r3, { s8 - s15} // restore floating point registers
|
||||
|
||||
#if defined(DSDOT)
|
||||
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#if defined(DSDOT)
|
||||
vmov r0, r1, d0
|
||||
#else
|
||||
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
vmov r0, s0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
|
|
@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define OLD_M r0
|
||||
#define OLD_N r1
|
||||
#define OLD_K r2
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#define OLD_ALPHA r3
|
||||
//#define OLD_A
|
||||
#else //hard
|
||||
#define OLD_A r3
|
||||
#define OLD_ALPHA s0
|
||||
#endif
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
|
@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define M [fp, #-256 ]
|
||||
#define N [fp, #-260 ]
|
||||
#define K [fp, #-264 ]
|
||||
|
||||
#ifndef ARM_SOFTFP_ABI
|
||||
#define A [fp, #-268 ]
|
||||
#endif
|
||||
|
||||
#define FP_ZERO [fp, #-240]
|
||||
#define FP_ZERO_0 [fp, #-240]
|
||||
|
@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
#define A [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#else //hard
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
#define L r2
|
||||
|
@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
||||
#ifdef ARM_SOFTFP_ABI
|
||||
str OLD_ALPHA, ALPHA
|
||||
#else //hard
|
||||
str OLD_A, A
|
||||
vstr OLD_ALPHA, ALPHA
|
||||
#endif
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31} // store floating point registers
|
||||
|
||||
|
|
|
@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-276 ]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define OLD_C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#define OFFSET [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define OLD_C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ALPHA [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHA_SOFTFP r3
|
||||
#define OLD_A_SOFTFP [fp, #4 ]
|
||||
#define B [fp, #8 ]
|
||||
#define C [fp, #12 ]
|
||||
#define OLD_LDC [fp, #16 ]
|
||||
#define OFFSET [fp, #20 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_X [fp, #0 ]
|
||||
#define OLD_INC_X [fp, #4 ]
|
||||
#define OLD_Y [fp, #8 ]
|
||||
#define OLD_INC_Y [fp, #12 ]
|
||||
#else
|
||||
#define OLD_X [fp, #8 ]
|
||||
#define OLD_INC_X [fp, #12]
|
||||
#define OLD_Y [fp, #16]
|
||||
#define OLD_INC_Y [fp, #20]
|
||||
#endif
|
||||
|
||||
#else //COMPLEX
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define OLD_X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define OLD_Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#else
|
||||
#define OLD_X [fp, #16]
|
||||
#define OLD_INC_X [fp, #20]
|
||||
#define OLD_Y [fp, #24]
|
||||
#define OLD_INC_Y [fp, #28]
|
||||
#endif
|
||||
|
||||
#endif // !defined(__ARM_PCS_VFP)
|
||||
|
||||
#else
|
||||
#define OLD_INC_X [fp, #0 ]
|
||||
#define OLD_Y [fp, #4 ]
|
||||
#define OLD_INC_Y [fp, #8 ]
|
||||
#endif
|
||||
|
||||
|
||||
#define N r0
|
||||
|
@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
push {r4 , fp}
|
||||
add fp, sp, #8
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
ldr X, OLD_X
|
||||
#endif
|
||||
ldr INC_X , OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y , OLD_INC_Y
|
||||
|
|
|
@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
#define OLD_Y r3
|
||||
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
|
@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
* registers
|
||||
*******************************************************/
|
||||
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_RETURN_ADDR r0
|
||||
#define OLD_N r1
|
||||
#define OLD_X r2
|
||||
#define OLD_INC_X r3
|
||||
#define OLD_Y [fp, #0 ]
|
||||
#define OLD_INC_Y [fp, #4 ]
|
||||
#define RETURN_ADDR r8
|
||||
#else
|
||||
#define OLD_Y r3
|
||||
#define OLD_INC_Y [fp, #0 ]
|
||||
#endif
|
||||
|
||||
#define I r5
|
||||
#define Y r6
|
||||
|
@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 5
|
||||
|
||||
push {r4 - r9, fp}
|
||||
add fp, sp, #24
|
||||
add fp, sp, #28
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
sub r4, fp, #128
|
||||
|
@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vcvt.f64.f32 d2, s0
|
||||
vcvt.f64.f32 d3, s0
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
mov RETURN_ADDR, OLD_RETURN_ADDR
|
||||
mov N, OLD_N
|
||||
mov X, OLD_X
|
||||
mov INC_X, OLD_INC_X
|
||||
ldr Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
#else
|
||||
mov Y, OLD_Y
|
||||
ldr INC_Y, OLD_INC_Y
|
||||
|
||||
#endif
|
||||
|
||||
cmp N, #0
|
||||
ble zdot_kernel_L999
|
||||
|
@ -280,8 +297,11 @@ zdot_kernel_L999:
|
|||
vadd.f64 d0 , d0, d2
|
||||
vsub.f64 d1 , d1, d3
|
||||
#endif
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vstm RETURN_ADDR, {d0 - d1}
|
||||
#endif
|
||||
|
||||
sub sp, fp, #24
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
|
|
|
@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
|
@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R1 vmls.f64
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
|
@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R1 vmls.f64
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR [fp, #0 ]
|
||||
#define OLD_ALPHAI [fp, #8 ]
|
||||
#define OLD_A_SOFTFP [fp, #16]
|
||||
#define OLD_LDA [fp, #20]
|
||||
#define X [fp, #24]
|
||||
#define OLD_INC_X [fp, #28]
|
||||
#define Y [fp, #32]
|
||||
#define OLD_INC_Y [fp, #36]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_M r0
|
||||
|
||||
|
@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
|
@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp N, #0
|
||||
ble zgemvn_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_M, M
|
||||
vstr d0 , ALPHA_R
|
||||
|
|
|
@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR [fp, #0 ]
|
||||
#define OLD_ALPHAI [fp, #8 ]
|
||||
#define OLD_A_SOFTFP [fp, #16]
|
||||
#define OLD_LDA [fp, #20]
|
||||
#define X [fp, #24]
|
||||
#define OLD_INC_X [fp, #28]
|
||||
#define Y [fp, #32]
|
||||
#define OLD_INC_Y [fp, #36]
|
||||
#else
|
||||
#define OLD_LDA [fp, #0 ]
|
||||
#define X [fp, #4 ]
|
||||
#define OLD_INC_X [fp, #8 ]
|
||||
#define Y [fp, #12 ]
|
||||
#define OLD_INC_Y [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define OLD_A r3
|
||||
#define OLD_N r1
|
||||
|
||||
|
@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
|
@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
cmp OLD_N, #0
|
||||
ble zgemvt_kernel_L999
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr d0, OLD_ALPHAR
|
||||
vldr d1, OLD_ALPHAI
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
|
||||
str OLD_A, A
|
||||
str OLD_N, N
|
||||
|
||||
|
|
|
@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#define OFFSET [fp, #36 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define KMAC_R fmacd
|
||||
#define KMAC_I fnmacd
|
||||
#define KMAC_I vmls.f64
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define KMAC_R fnmacd
|
||||
#define KMAC_R vmls.f64
|
||||
#define KMAC_I fmacd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I1 vmls.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#endif
|
||||
|
@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ALPHA_I [fp, #-272]
|
||||
#define ALPHA_R [fp, #-280]
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
#define OLD_ALPHAR_SOFTFP [fp, #4]
|
||||
#define OLD_ALPHAI_SOFTFP [fp, #12]
|
||||
#define OLD_A_SOFTFP [fp, #20 ]
|
||||
#define B [fp, #24 ]
|
||||
#define C [fp, #28 ]
|
||||
#define OLD_LDC [fp, #32 ]
|
||||
#define OFFSET [fp, #36 ]
|
||||
#else
|
||||
#define B [fp, #4 ]
|
||||
#define C [fp, #8 ]
|
||||
#define OLD_LDC [fp, #12 ]
|
||||
#define OFFSET [fp, #16 ]
|
||||
#endif
|
||||
|
||||
#define I r0
|
||||
#define J r1
|
||||
|
@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmuld
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R1 vnmul.f64
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmuld
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
|
@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define FMAC_R1 fmuld
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmuld
|
||||
#define FMAC_I1 vnmul.f64
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmuld
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_R2 vmls.f64
|
||||
#define FMAC_I1 fmuld
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
|
@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmuld
|
||||
#define FMAC_R1 vnmul.f64
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmuld
|
||||
#define FMAC_I2 fnmacd
|
||||
#define FMAC_I1 vnmul.f64
|
||||
#define FMAC_I2 vmls.f64
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
add fp, sp, #24
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
#if !defined(__ARM_PCS_VFP)
|
||||
vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
|
||||
vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
|
||||
ldr OLD_A, OLD_A_SOFTFP
|
||||
#endif
|
||||
str OLD_M, M
|
||||
str OLD_N, N
|
||||
str OLD_K, K
|
||||
|
|
|
@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
|
|||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
|
|||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
|
|
|
@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
|||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %5, %2 \n\t"
|
||||
"lxvw4x 34, %6, %2 \n\t"
|
||||
"lxvw4x 35, %7, %2 \n\t"
|
||||
"lxvw4x 36, %8, %2 \n\t"
|
||||
"lxvw4x 37, %9, %2 \n\t"
|
||||
"lxvw4x 38, %10, %2 \n\t"
|
||||
"lxvw4x 39, %11, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %5, %2 \n\t"
|
||||
"lxvd2x 34, %6, %2 \n\t"
|
||||
"lxvd2x 35, %7, %2 \n\t"
|
||||
"lxvd2x 36, %8, %2 \n\t"
|
||||
"lxvd2x 37, %9, %2 \n\t"
|
||||
"lxvd2x 38, %10, %2 \n\t"
|
||||
"lxvd2x 39, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
|||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %5, %2 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"lxvw4x 34, %6, %2 \n\t"
|
||||
"lxvw4x 35, %7, %2 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"lxvw4x 36, %8, %2 \n\t"
|
||||
"lxvw4x 37, %9, %2 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"lxvw4x 38, %10, %2 \n\t"
|
||||
"lxvw4x 39, %11, %2 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %5, %2 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"lxvd2x 34, %6, %2 \n\t"
|
||||
"lxvd2x 35, %7, %2 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"lxvd2x 36, %8, %2 \n\t"
|
||||
"lxvd2x 37, %9, %2 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
"lxvd2x 38, %10, %2 \n\t"
|
||||
"lxvd2x 39, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
|||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||
:
|
||||
|
|
|
@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
|
|||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %4 \n\t"
|
||||
"lxvw4x 33, %5, %4 \n\t"
|
||||
"lxvw4x 34, %6, %4 \n\t"
|
||||
"lxvw4x 35, %7, %4 \n\t"
|
||||
"lxvw4x 36, %8, %4 \n\t"
|
||||
"lxvw4x 37, %9, %4 \n\t"
|
||||
"lxvw4x 38, %10, %4 \n\t"
|
||||
"lxvw4x 39, %11, %4 \n\t"
|
||||
"lxvd2x 32, 0, %4 \n\t"
|
||||
"lxvd2x 33, %5, %4 \n\t"
|
||||
"lxvd2x 34, %6, %4 \n\t"
|
||||
"lxvd2x 35, %7, %4 \n\t"
|
||||
"lxvd2x 36, %8, %4 \n\t"
|
||||
"lxvd2x 37, %9, %4 \n\t"
|
||||
"lxvd2x 38, %10, %4 \n\t"
|
||||
"lxvd2x 39, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %4 \n\t"
|
||||
"lxvw4x 41, %5, %4 \n\t"
|
||||
"lxvw4x 42, %6, %4 \n\t"
|
||||
"lxvw4x 43, %7, %4 \n\t"
|
||||
"lxvw4x 44, %8, %4 \n\t"
|
||||
"lxvw4x 45, %9, %4 \n\t"
|
||||
"lxvw4x 46, %10, %4 \n\t"
|
||||
"lxvw4x 47, %11, %4 \n\t"
|
||||
"lxvd2x 40, 0, %4 \n\t"
|
||||
"lxvd2x 41, %5, %4 \n\t"
|
||||
"lxvd2x 42, %6, %4 \n\t"
|
||||
"lxvd2x 43, %7, %4 \n\t"
|
||||
"lxvd2x 44, %8, %4 \n\t"
|
||||
"lxvd2x 45, %9, %4 \n\t"
|
||||
"lxvd2x 46, %10, %4 \n\t"
|
||||
"lxvd2x 47, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, -128 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvw4x 49, %5, %3 \n\t"
|
||||
"lxvw4x 50, %6, %3 \n\t"
|
||||
"lxvw4x 51, %7, %3 \n\t"
|
||||
"lxvw4x 0, %8, %3 \n\t"
|
||||
"lxvw4x 1, %9, %3 \n\t"
|
||||
"lxvw4x 2, %10, %3 \n\t"
|
||||
"lxvw4x 3, %11, %3 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 49, %5, %3 \n\t"
|
||||
"lxvd2x 50, %6, %3 \n\t"
|
||||
"lxvd2x 51, %7, %3 \n\t"
|
||||
"lxvd2x 0, %8, %3 \n\t"
|
||||
"lxvd2x 1, %9, %3 \n\t"
|
||||
"lxvd2x 2, %10, %3 \n\t"
|
||||
"lxvd2x 3, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"lxvw4x 4, 0, %3 \n\t"
|
||||
"lxvw4x 5, %5, %3 \n\t"
|
||||
"lxvw4x 6, %6, %3 \n\t"
|
||||
"lxvw4x 7, %7, %3 \n\t"
|
||||
"lxvw4x 8, %8, %3 \n\t"
|
||||
"lxvw4x 9, %9, %3 \n\t"
|
||||
"lxvw4x 10, %10, %3 \n\t"
|
||||
"lxvw4x 11, %11, %3 \n\t"
|
||||
"lxvd2x 4, 0, %3 \n\t"
|
||||
"lxvd2x 5, %5, %3 \n\t"
|
||||
"lxvd2x 6, %6, %3 \n\t"
|
||||
"lxvd2x 7, %7, %3 \n\t"
|
||||
"lxvd2x 8, %8, %3 \n\t"
|
||||
"lxvd2x 9, %9, %3 \n\t"
|
||||
"lxvd2x 10, %10, %3 \n\t"
|
||||
"lxvd2x 11, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, -128 \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 48, 0, %4 \n\t"
|
||||
"stxvw4x 49, %5, %4 \n\t"
|
||||
"stxvw4x 50, %6, %4 \n\t"
|
||||
"stxvw4x 51, %7, %4 \n\t"
|
||||
"stxvw4x 0, %8, %4 \n\t"
|
||||
"stxvw4x 1, %9, %4 \n\t"
|
||||
"stxvw4x 2, %10, %4 \n\t"
|
||||
"stxvw4x 3, %11, %4 \n\t"
|
||||
"stxvd2x 48, 0, %4 \n\t"
|
||||
"stxvd2x 49, %5, %4 \n\t"
|
||||
"stxvd2x 50, %6, %4 \n\t"
|
||||
"stxvd2x 51, %7, %4 \n\t"
|
||||
"stxvd2x 0, %8, %4 \n\t"
|
||||
"stxvd2x 1, %9, %4 \n\t"
|
||||
"stxvd2x 2, %10, %4 \n\t"
|
||||
"stxvd2x 3, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"stxvw4x 4, 0, %4 \n\t"
|
||||
"stxvw4x 5, %5, %4 \n\t"
|
||||
"stxvw4x 6, %6, %4 \n\t"
|
||||
"stxvw4x 7, %7, %4 \n\t"
|
||||
"stxvw4x 8, %8, %4 \n\t"
|
||||
"stxvw4x 9, %9, %4 \n\t"
|
||||
"stxvw4x 10, %10, %4 \n\t"
|
||||
"stxvw4x 11, %11, %4 \n\t"
|
||||
"stxvd2x 4, 0, %4 \n\t"
|
||||
"stxvd2x 5, %5, %4 \n\t"
|
||||
"stxvd2x 6, %6, %4 \n\t"
|
||||
"stxvd2x 7, %7, %4 \n\t"
|
||||
"stxvd2x 8, %8, %4 \n\t"
|
||||
"stxvd2x 9, %9, %4 \n\t"
|
||||
"stxvd2x 10, %10, %4 \n\t"
|
||||
"stxvd2x 11, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
|
|
|
@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
|
|||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
|
|||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %8, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %8, %2 \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
|
||||
"lxvw4x 42, %9, %2 \n\t"
|
||||
"lxvw4x 43, %10, %2 \n\t"
|
||||
"lxvd2x 42, %9, %2 \n\t"
|
||||
"lxvd2x 43, %10, %2 \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"lxvw4x 44, %11, %2 \n\t"
|
||||
"lxvw4x 45, %12, %2 \n\t"
|
||||
"lxvd2x 44, %11, %2 \n\t"
|
||||
"lxvd2x 45, %12, %2 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvw4x 46, %13, %2 \n\t"
|
||||
"lxvw4x 47, %14, %2 \n\t"
|
||||
"lxvd2x 46, %13, %2 \n\t"
|
||||
"lxvd2x 47, %14, %2 \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
|
|
|
@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
|||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
|||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 41, %5, %2 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 42, %6, %2 \n\t"
|
||||
"lxvw4x 43, %7, %2 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 44, %8, %2 \n\t"
|
||||
"lxvw4x 45, %9, %2 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n\t"
|
||||
"lxvw4x 46, %10, %2 \n\t"
|
||||
"lxvw4x 47, %11, %2 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 41, %5, %2 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 42, %6, %2 \n\t"
|
||||
"lxvd2x 43, %7, %2 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 44, %8, %2 \n\t"
|
||||
"lxvd2x 45, %9, %2 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n\t"
|
||||
"lxvd2x 46, %10, %2 \n\t"
|
||||
"lxvd2x 47, %11, %2 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
|
|||
|
||||
"2: \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t"
|
||||
"stxvw4x 41, %5, %3 \n\t"
|
||||
"stxvw4x 42, %6, %3 \n\t"
|
||||
"stxvw4x 43, %7, %3 \n\t"
|
||||
"stxvw4x 44, %8, %3 \n\t"
|
||||
"stxvw4x 45, %9, %3 \n\t"
|
||||
"stxvw4x 46, %10, %3 \n\t"
|
||||
"stxvw4x 47, %11, %3 \n"
|
||||
"stxvd2x 40, 0, %3 \n\t"
|
||||
"stxvd2x 41, %5, %3 \n\t"
|
||||
"stxvd2x 42, %6, %3 \n\t"
|
||||
"stxvd2x 43, %7, %3 \n\t"
|
||||
"stxvd2x 44, %8, %3 \n\t"
|
||||
"stxvd2x 45, %9, %3 \n\t"
|
||||
"stxvd2x 46, %10, %3 \n\t"
|
||||
"stxvd2x 47, %11, %3 \n"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
|
||||
:
|
||||
|
|
|
@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
|||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvw4x 41, %10, %2 \n\t"
|
||||
"lxvw4x 49, %10, %3 \n\t"
|
||||
"lxvw4x 42, %11, %2 \n\t"
|
||||
"lxvw4x 50, %11, %3 \n\t"
|
||||
"lxvw4x 43, %12, %2 \n\t"
|
||||
"lxvw4x 51, %12, %3 \n\t"
|
||||
"lxvw4x 44, %13, %2 \n\t"
|
||||
"lxvw4x %x4, %13, %3 \n\t"
|
||||
"lxvw4x 45, %14, %2 \n\t"
|
||||
"lxvw4x %x5, %14, %3 \n\t"
|
||||
"lxvw4x 46, %15, %2 \n\t"
|
||||
"lxvw4x %x6, %15, %3 \n\t"
|
||||
"lxvw4x 47, %16, %2 \n\t"
|
||||
"lxvw4x %x7, %16, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"lxvd2x 41, %10, %2 \n\t"
|
||||
"lxvd2x 49, %10, %3 \n\t"
|
||||
"lxvd2x 42, %11, %2 \n\t"
|
||||
"lxvd2x 50, %11, %3 \n\t"
|
||||
"lxvd2x 43, %12, %2 \n\t"
|
||||
"lxvd2x 51, %12, %3 \n\t"
|
||||
"lxvd2x 44, %13, %2 \n\t"
|
||||
"lxvd2x %x4, %13, %3 \n\t"
|
||||
"lxvd2x 45, %14, %2 \n\t"
|
||||
"lxvd2x %x5, %14, %3 \n\t"
|
||||
"lxvd2x 46, %15, %2 \n\t"
|
||||
"lxvd2x %x6, %15, %3 \n\t"
|
||||
"lxvd2x 47, %16, %2 \n\t"
|
||||
"lxvd2x %x7, %16, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
|
|||
"1: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t"
|
||||
"lxvw4x 40, 0, %2 \n\t"
|
||||
"lxvw4x 48, 0, %3 \n\t"
|
||||
"lxvd2x 40, 0, %2 \n\t"
|
||||
"lxvd2x 48, 0, %3 \n\t"
|
||||
"xvmaddasp 33, 41, 49 \n\t"
|
||||
"lxvw4x 41, %10, %2 \n\t"
|
||||
"lxvw4x 49, %10, %3 \n\t"
|
||||
"lxvd2x 41, %10, %2 \n\t"
|
||||
"lxvd2x 49, %10, %3 \n\t"
|
||||
"xvmaddasp 34, 42, 50 \n\t"
|
||||
"lxvw4x 42, %11, %2 \n\t"
|
||||
"lxvw4x 50, %11, %3 \n\t"
|
||||
"lxvd2x 42, %11, %2 \n\t"
|
||||
"lxvd2x 50, %11, %3 \n\t"
|
||||
"xvmaddasp 35, 43, 51 \n\t"
|
||||
"lxvw4x 43, %12, %2 \n\t"
|
||||
"lxvw4x 51, %12, %3 \n\t"
|
||||
"lxvd2x 43, %12, %2 \n\t"
|
||||
"lxvd2x 51, %12, %3 \n\t"
|
||||
"xvmaddasp 36, 44, %x4 \n\t"
|
||||
"lxvw4x 44, %13, %2 \n\t"
|
||||
"lxvw4x %x4, %13, %3 \n\t"
|
||||
"lxvd2x 44, %13, %2 \n\t"
|
||||
"lxvd2x %x4, %13, %3 \n\t"
|
||||
"xvmaddasp 37, 45, %x5 \n\t"
|
||||
"lxvw4x 45, %14, %2 \n\t"
|
||||
"lxvw4x %x5, %14, %3 \n\t"
|
||||
"lxvd2x 45, %14, %2 \n\t"
|
||||
"lxvd2x %x5, %14, %3 \n\t"
|
||||
"xvmaddasp 38, 46, %x6 \n\t"
|
||||
"lxvw4x 46, %15, %2 \n\t"
|
||||
"lxvw4x %x6, %15, %3 \n\t"
|
||||
"lxvd2x 46, %15, %2 \n\t"
|
||||
"lxvd2x %x6, %15, %3 \n\t"
|
||||
"xvmaddasp 39, 47, %x7 \n\t"
|
||||
"lxvw4x 47, %16, %2 \n\t"
|
||||
"lxvw4x %x7, %16, %3 \n\t"
|
||||
"lxvd2x 47, %16, %2 \n\t"
|
||||
"lxvd2x %x7, %16, %3 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
|
|
@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
|||
"xscvdpspn 37, %x14 \n\t" // load s to all words
|
||||
"xxspltw 37, 37, 0 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %3 \n\t" // load x
|
||||
"lxvw4x 33, %15, %3 \n\t"
|
||||
"lxvw4x 34, %16, %3 \n\t"
|
||||
"lxvw4x 35, %17, %3 \n\t"
|
||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||
"lxvd2x 33, %15, %3 \n\t"
|
||||
"lxvd2x 34, %16, %3 \n\t"
|
||||
"lxvd2x 35, %17, %3 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %4 \n\t" // load y
|
||||
"lxvw4x 49, %15, %4 \n\t"
|
||||
"lxvw4x 50, %16, %4 \n\t"
|
||||
"lxvw4x 51, %17, %4 \n\t"
|
||||
"lxvd2x 48, 0, %4 \n\t" // load y
|
||||
"lxvd2x 49, %15, %4 \n\t"
|
||||
"lxvd2x 50, %16, %4 \n\t"
|
||||
"lxvd2x 51, %17, %4 \n\t"
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"addi %4, %4, 64 \n\t"
|
||||
|
@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
|||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %3 \n\t" // load x
|
||||
"lxvw4x 33, %15, %3 \n\t"
|
||||
"lxvd2x 32, 0, %3 \n\t" // load x
|
||||
"lxvd2x 33, %15, %3 \n\t"
|
||||
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
|
||||
"lxvw4x 34, %16, %3 \n\t"
|
||||
"lxvw4x 35, %17, %3 \n\t"
|
||||
"lxvd2x 34, %16, %3 \n\t"
|
||||
"lxvd2x 35, %17, %3 \n\t"
|
||||
|
||||
"xvmulsp %x9, 48, 37 \n\t" // s * y
|
||||
"xvmulsp %x10, 49, 37 \n\t"
|
||||
|
||||
"lxvw4x 48, 0, %4 \n\t" // load y
|
||||
"lxvw4x 49, %15, %4 \n\t"
|
||||
"lxvd2x 48, 0, %4 \n\t" // load y
|
||||
"lxvd2x 49, %15, %4 \n\t"
|
||||
|
||||
"xvmulsp %x11, 50, 37 \n\t"
|
||||
"xvmulsp %x12, 51, 37 \n\t"
|
||||
|
||||
"lxvw4x 50, %16, %4 \n\t"
|
||||
"lxvw4x 51, %17, %4 \n\t"
|
||||
"lxvd2x 50, %16, %4 \n\t"
|
||||
"lxvd2x 51, %17, %4 \n\t"
|
||||
|
||||
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
|
||||
|
@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
|||
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t" // store x
|
||||
"stxvw4x 41, %15, %3 \n\t"
|
||||
"stxvw4x 42, %16, %3 \n\t"
|
||||
"stxvw4x 43, %17, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t" // store x
|
||||
"stxvd2x 41, %15, %3 \n\t"
|
||||
"stxvd2x 42, %16, %3 \n\t"
|
||||
"stxvd2x 43, %17, %3 \n\t"
|
||||
|
||||
"stxvw4x %x5, 0, %4 \n\t" // store y
|
||||
"stxvw4x %x6, %15, %4 \n\t"
|
||||
"stxvw4x %x7, %16, %4 \n\t"
|
||||
"stxvw4x %x8, %17, %4 \n\t"
|
||||
"stxvd2x %x5, 0, %4 \n\t" // store y
|
||||
"stxvd2x %x6, %15, %4 \n\t"
|
||||
"stxvd2x %x7, %16, %4 \n\t"
|
||||
"stxvd2x %x8, %17, %4 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
|||
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvw4x 40, 0, %3 \n\t" // store x
|
||||
"stxvw4x 41, %15, %3 \n\t"
|
||||
"stxvw4x 42, %16, %3 \n\t"
|
||||
"stxvw4x 43, %17, %3 \n\t"
|
||||
"stxvd2x 40, 0, %3 \n\t" // store x
|
||||
"stxvd2x 41, %15, %3 \n\t"
|
||||
"stxvd2x 42, %16, %3 \n\t"
|
||||
"stxvd2x 43, %17, %3 \n\t"
|
||||
|
||||
"stxvw4x %x5, 0, %4 \n\t" // store y
|
||||
"stxvw4x %x6, %15, %4 \n\t"
|
||||
"stxvw4x %x7, %16, %4 \n\t"
|
||||
"stxvw4x %x8, %17, %4 \n"
|
||||
"stxvd2x %x5, 0, %4 \n\t" // store y
|
||||
"stxvd2x %x6, %15, %4 \n\t"
|
||||
"stxvd2x %x7, %16, %4 \n\t"
|
||||
"stxvd2x %x8, %17, %4 \n"
|
||||
|
||||
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
|
||||
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
|
||||
|
|
|
@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
|||
"xscvdpspn %x3, %x3 \n\t"
|
||||
"xxspltw %x3, %x3, 0 \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %4, %2 \n\t"
|
||||
"lxvw4x 34, %5, %2 \n\t"
|
||||
"lxvw4x 35, %6, %2 \n\t"
|
||||
"lxvw4x 36, %7, %2 \n\t"
|
||||
"lxvw4x 37, %8, %2 \n\t"
|
||||
"lxvw4x 38, %9, %2 \n\t"
|
||||
"lxvw4x 39, %10, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %4, %2 \n\t"
|
||||
"lxvd2x 34, %5, %2 \n\t"
|
||||
"lxvd2x 35, %6, %2 \n\t"
|
||||
"lxvd2x 36, %7, %2 \n\t"
|
||||
"lxvd2x 37, %8, %2 \n\t"
|
||||
"lxvd2x 38, %9, %2 \n\t"
|
||||
"lxvd2x 39, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
|||
|
||||
"xvmulsp 40, 32, %x3 \n\t"
|
||||
"xvmulsp 41, 33, %x3 \n\t"
|
||||
"lxvw4x 32, 0, %2 \n\t"
|
||||
"lxvw4x 33, %4, %2 \n\t"
|
||||
"lxvd2x 32, 0, %2 \n\t"
|
||||
"lxvd2x 33, %4, %2 \n\t"
|
||||
"xvmulsp 42, 34, %x3 \n\t"
|
||||
"xvmulsp 43, 35, %x3 \n\t"
|
||||
"lxvw4x 34, %5, %2 \n\t"
|
||||
"lxvw4x 35, %6, %2 \n\t"
|
||||
"lxvd2x 34, %5, %2 \n\t"
|
||||
"lxvd2x 35, %6, %2 \n\t"
|
||||
"xvmulsp 44, 36, %x3 \n\t"
|
||||
"xvmulsp 45, 37, %x3 \n\t"
|
||||
"lxvw4x 36, %7, %2 \n\t"
|
||||
"lxvw4x 37, %8, %2 \n\t"
|
||||
"lxvd2x 36, %7, %2 \n\t"
|
||||
"lxvd2x 37, %8, %2 \n\t"
|
||||
"xvmulsp 46, 38, %x3 \n\t"
|
||||
"xvmulsp 47, 39, %x3 \n\t"
|
||||
"lxvw4x 38, %9, %2 \n\t"
|
||||
"lxvw4x 39, %10, %2 \n\t"
|
||||
"lxvd2x 38, %9, %2 \n\t"
|
||||
"lxvd2x 39, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, -128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %2 \n\t"
|
||||
"stxvw4x 41, %4, %2 \n\t"
|
||||
"stxvw4x 42, %5, %2 \n\t"
|
||||
"stxvw4x 43, %6, %2 \n\t"
|
||||
"stxvw4x 44, %7, %2 \n\t"
|
||||
"stxvw4x 45, %8, %2 \n\t"
|
||||
"stxvw4x 46, %9, %2 \n\t"
|
||||
"stxvw4x 47, %10, %2 \n\t"
|
||||
"stxvd2x 40, 0, %2 \n\t"
|
||||
"stxvd2x 41, %4, %2 \n\t"
|
||||
"stxvd2x 42, %5, %2 \n\t"
|
||||
"stxvd2x 43, %6, %2 \n\t"
|
||||
"stxvd2x 44, %7, %2 \n\t"
|
||||
"stxvd2x 45, %8, %2 \n\t"
|
||||
"stxvd2x 46, %9, %2 \n\t"
|
||||
"stxvd2x 47, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 256 \n\t"
|
||||
|
||||
|
@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
|
|||
"xvmulsp 46, 38, %x3 \n\t"
|
||||
"xvmulsp 47, 39, %x3 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %2 \n\t"
|
||||
"stxvw4x 41, %4, %2 \n\t"
|
||||
"stxvw4x 42, %5, %2 \n\t"
|
||||
"stxvw4x 43, %6, %2 \n\t"
|
||||
"stxvw4x 44, %7, %2 \n\t"
|
||||
"stxvw4x 45, %8, %2 \n\t"
|
||||
"stxvw4x 46, %9, %2 \n\t"
|
||||
"stxvw4x 47, %10, %2 \n"
|
||||
"stxvd2x 40, 0, %2 \n\t"
|
||||
"stxvd2x 41, %4, %2 \n\t"
|
||||
"stxvd2x 42, %5, %2 \n\t"
|
||||
"stxvd2x 43, %6, %2 \n\t"
|
||||
"stxvd2x 44, %7, %2 \n\t"
|
||||
"stxvd2x 45, %8, %2 \n\t"
|
||||
"stxvd2x 46, %9, %2 \n\t"
|
||||
"stxvd2x 47, %10, %2 \n"
|
||||
|
||||
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
|
||||
:
|
||||
|
@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
|
|||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"stxvw4x %x3, 0, %2 \n\t"
|
||||
"stxvw4x %x3, %4, %2 \n\t"
|
||||
"stxvw4x %x3, %5, %2 \n\t"
|
||||
"stxvw4x %x3, %6, %2 \n\t"
|
||||
"stxvw4x %x3, %7, %2 \n\t"
|
||||
"stxvw4x %x3, %8, %2 \n\t"
|
||||
"stxvw4x %x3, %9, %2 \n\t"
|
||||
"stxvw4x %x3, %10, %2 \n\t"
|
||||
"stxvd2x %x3, 0, %2 \n\t"
|
||||
"stxvd2x %x3, %4, %2 \n\t"
|
||||
"stxvd2x %x3, %5, %2 \n\t"
|
||||
"stxvd2x %x3, %6, %2 \n\t"
|
||||
"stxvd2x %x3, %7, %2 \n\t"
|
||||
"stxvd2x %x3, %8, %2 \n\t"
|
||||
"stxvd2x %x3, %9, %2 \n\t"
|
||||
"stxvd2x %x3, %10, %2 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
|
|
@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
|
|||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
|
||||
"lxvw4x 32, 0, %4 \n\t"
|
||||
"lxvw4x 33, %5, %4 \n\t"
|
||||
"lxvw4x 34, %6, %4 \n\t"
|
||||
"lxvw4x 35, %7, %4 \n\t"
|
||||
"lxvw4x 36, %8, %4 \n\t"
|
||||
"lxvw4x 37, %9, %4 \n\t"
|
||||
"lxvw4x 38, %10, %4 \n\t"
|
||||
"lxvw4x 39, %11, %4 \n\t"
|
||||
"lxvd2x 32, 0, %4 \n\t"
|
||||
"lxvd2x 33, %5, %4 \n\t"
|
||||
"lxvd2x 34, %6, %4 \n\t"
|
||||
"lxvd2x 35, %7, %4 \n\t"
|
||||
"lxvd2x 36, %8, %4 \n\t"
|
||||
"lxvd2x 37, %9, %4 \n\t"
|
||||
"lxvd2x 38, %10, %4 \n\t"
|
||||
"lxvd2x 39, %11, %4 \n\t"
|
||||
|
||||
"lxvw4x 40, 0, %3 \n\t"
|
||||
"lxvw4x 41, %5, %3 \n\t"
|
||||
"lxvw4x 42, %6, %3 \n\t"
|
||||
"lxvw4x 43, %7, %3 \n\t"
|
||||
"lxvw4x 44, %8, %3 \n\t"
|
||||
"lxvw4x 45, %9, %3 \n\t"
|
||||
"lxvw4x 46, %10, %3 \n\t"
|
||||
"lxvw4x 47, %11, %3 \n\t"
|
||||
"lxvd2x 40, 0, %3 \n\t"
|
||||
"lxvd2x 41, %5, %3 \n\t"
|
||||
"lxvd2x 42, %6, %3 \n\t"
|
||||
"lxvd2x 43, %7, %3 \n\t"
|
||||
"lxvd2x 44, %8, %3 \n\t"
|
||||
"lxvd2x 45, %9, %3 \n\t"
|
||||
"lxvd2x 46, %10, %3 \n\t"
|
||||
"lxvd2x 47, %11, %3 \n\t"
|
||||
|
||||
"stxvw4x 32, 0, %3 \n\t"
|
||||
"stxvw4x 33, %5, %3 \n\t"
|
||||
"stxvw4x 34, %6, %3 \n\t"
|
||||
"stxvw4x 35, %7, %3 \n\t"
|
||||
"stxvw4x 36, %8, %3 \n\t"
|
||||
"stxvw4x 37, %9, %3 \n\t"
|
||||
"stxvw4x 38, %10, %3 \n\t"
|
||||
"stxvw4x 39, %11, %3 \n\t"
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
"stxvd2x 34, %6, %3 \n\t"
|
||||
"stxvd2x 35, %7, %3 \n\t"
|
||||
"stxvd2x 36, %8, %3 \n\t"
|
||||
"stxvd2x 37, %9, %3 \n\t"
|
||||
"stxvd2x 38, %10, %3 \n\t"
|
||||
"stxvd2x 39, %11, %3 \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxvw4x 40, 0, %4 \n\t"
|
||||
"stxvw4x 41, %5, %4 \n\t"
|
||||
"stxvw4x 42, %6, %4 \n\t"
|
||||
"stxvw4x 43, %7, %4 \n\t"
|
||||
"stxvw4x 44, %8, %4 \n\t"
|
||||
"stxvw4x 45, %9, %4 \n\t"
|
||||
"stxvw4x 46, %10, %4 \n\t"
|
||||
"stxvw4x 47, %11, %4 \n\t"
|
||||
"stxvd2x 40, 0, %4 \n\t"
|
||||
"stxvd2x 41, %5, %4 \n\t"
|
||||
"stxvd2x 42, %6, %4 \n\t"
|
||||
"stxvd2x 43, %7, %4 \n\t"
|
||||
"stxvd2x 44, %8, %4 \n\t"
|
||||
"stxvd2x 45, %9, %4 \n\t"
|
||||
"stxvd2x 46, %10, %4 \n\t"
|
||||
"stxvd2x 47, %11, %4 \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
|
|
|
@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
|||
target_link_libraries(${OpenBLAS_utest_bin} m)
|
||||
endif()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
|
||||
set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS")
|
||||
endif()
|
||||
|
||||
#Set output for utest
|
||||
set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
|
|
Loading…
Reference in New Issue